LLVM 22.0.0git
X86ISelLowering.cpp
Go to the documentation of this file.
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
// Hidden tuning knob (default: true). When enabled, the X86 lowering is
// allowed to replace a narrow shift with an equivalent wider shift.
// NOTE(review): the exact combine(s) gated by this flag are defined elsewhere
// in this file — confirm against the uses of WidenShift before relying on it.
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
305
306 if (!Subtarget.is64Bit()) {
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
329 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
330 MVT::v4i64}) {
333 }
334 if (Subtarget.hasAVX10_2_512()) {
337 }
338 if (Subtarget.is64Bit()) {
341 }
342 }
343
344 // Handle address space casts between mixed sized pointers.
347
348 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
349 if (!Subtarget.hasSSE2()) {
354 if (Subtarget.is64Bit()) {
356 // Without SSE, i64->f64 goes through memory.
358 }
359 } else if (!Subtarget.is64Bit())
361
362 // Scalar integer divide and remainder are lowered to use operations that
363 // produce two results, to match the available instructions. This exposes
364 // the two-result form to trivial CSE, which is able to combine x/y and x%y
365 // into a single instruction.
366 //
367 // Scalar integer multiply-high is also lowered to use two-result
368 // operations, to match the available instructions. However, plain multiply
369 // (low) operations are left as Legal, as there are single-result
370 // instructions for this in x86. Using the two-result multiply instructions
371 // when both high and low results are needed must be arranged by dagcombine.
372 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
379 }
380
381 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
383 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
384 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
387 }
388 if (Subtarget.is64Bit())
393
394 setOperationAction(ISD::FREM , MVT::f32 , Expand);
395 setOperationAction(ISD::FREM , MVT::f64 , Expand);
396 setOperationAction(ISD::FREM , MVT::f80 , Expand);
397 setOperationAction(ISD::FREM , MVT::f128 , Expand);
398
399 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
405 }
406
407 // Promote the i8 variants and force them on up to i32 which has a shorter
408 // encoding.
409 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
411 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
412 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
413 // promote that too.
414 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
416
417 if (!Subtarget.hasBMI()) {
418 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
420 if (Subtarget.is64Bit()) {
421 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
423 }
424 }
425
426 if (Subtarget.hasLZCNT()) {
427 // When promoting the i8 variants, force them to i32 for a shorter
428 // encoding.
429 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
431 } else {
432 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
433 if (VT == MVT::i64 && !Subtarget.is64Bit())
434 continue;
437 }
438 }
439
442 // Special handling for half-precision floating point conversions.
443 // If we don't have F16C support, then lower half float conversions
444 // into library calls.
446 Op, MVT::f32,
447 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
448 // There's never any support for operations beyond MVT::f32.
449 setOperationAction(Op, MVT::f64, Expand);
450 setOperationAction(Op, MVT::f80, Expand);
451 setOperationAction(Op, MVT::f128, Expand);
452 }
453
454 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
457 }
458
459 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
460 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
461 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
462 setTruncStoreAction(VT, MVT::f16, Expand);
463 setTruncStoreAction(VT, MVT::bf16, Expand);
464
467 }
468
472 if (Subtarget.is64Bit())
474 if (Subtarget.hasPOPCNT()) {
475 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
476 // popcntw is longer to encode than popcntl and also has a false dependency
477 // on the dest that popcntl hasn't had since Cannon Lake.
478 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
479 } else {
484 }
485
487
488 if (!Subtarget.hasMOVBE())
490
491 // X86 wants to expand cmov itself.
492 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
497 }
498 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
499 if (VT == MVT::i64 && !Subtarget.is64Bit())
500 continue;
503 }
504
505 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
508
510 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
511 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
515
516 // Darwin ABI issue.
517 for (auto VT : { MVT::i32, MVT::i64 }) {
518 if (VT == MVT::i64 && !Subtarget.is64Bit())
519 continue;
526 }
527
528 // 64-bit shl, sra, srl (iff 32-bit x86)
529 for (auto VT : { MVT::i32, MVT::i64 }) {
530 if (VT == MVT::i64 && !Subtarget.is64Bit())
531 continue;
535 }
536
537 if (Subtarget.hasSSEPrefetch())
539
541
542 // Expand certain atomics
543 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
551 }
552
553 if (!Subtarget.is64Bit())
555
556 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
557 // All CPUs supporting AVX will atomically load/store aligned 128-bit
558 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
561 }
562
563 if (Subtarget.canUseCMPXCHG16B())
565
566 // FIXME - use subtarget debug flags
567 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
568 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
569 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
571 }
572
575
578
579 setOperationAction(ISD::TRAP, MVT::Other, Legal);
581 if (Subtarget.isTargetPS())
583 else
585
586 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
588 setOperationAction(ISD::VAEND , MVT::Other, Expand);
589 bool Is64Bit = Subtarget.is64Bit();
590 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
591 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
592
595
597
598 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
601
603
604 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
605 setOperationAction(ISD::FABS, VT, Action);
606 setOperationAction(ISD::FNEG, VT, Action);
608 setOperationAction(ISD::FREM, VT, Action);
609 setOperationAction(ISD::FMA, VT, Action);
610 setOperationAction(ISD::FMINNUM, VT, Action);
611 setOperationAction(ISD::FMAXNUM, VT, Action);
616 setOperationAction(ISD::FSIN, VT, Action);
617 setOperationAction(ISD::FCOS, VT, Action);
618 setOperationAction(ISD::FSINCOS, VT, Action);
619 setOperationAction(ISD::FTAN, VT, Action);
620 setOperationAction(ISD::FSQRT, VT, Action);
621 setOperationAction(ISD::FPOW, VT, Action);
622 setOperationAction(ISD::FPOWI, VT, Action);
623 setOperationAction(ISD::FLOG, VT, Action);
624 setOperationAction(ISD::FLOG2, VT, Action);
625 setOperationAction(ISD::FLOG10, VT, Action);
626 setOperationAction(ISD::FEXP, VT, Action);
627 setOperationAction(ISD::FEXP2, VT, Action);
628 setOperationAction(ISD::FEXP10, VT, Action);
629 setOperationAction(ISD::FCEIL, VT, Action);
630 setOperationAction(ISD::FFLOOR, VT, Action);
632 setOperationAction(ISD::FRINT, VT, Action);
633 setOperationAction(ISD::BR_CC, VT, Action);
634 setOperationAction(ISD::SETCC, VT, Action);
637 setOperationAction(ISD::FROUND, VT, Action);
639 setOperationAction(ISD::FTRUNC, VT, Action);
640 setOperationAction(ISD::FLDEXP, VT, Action);
641 };
642
643 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
644 // f16, f32 and f64 use SSE.
645 // Set up the FP register classes.
646 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
647 : &X86::FR16RegClass);
648 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
649 : &X86::FR32RegClass);
650 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
651 : &X86::FR64RegClass);
652
653 // Disable f32->f64 extload as we can only generate this in one instruction
654 // under optsize. So its easier to pattern match (fpext (load)) for that
655 // case instead of needing to emit 2 instructions for extload in the
656 // non-optsize case.
657 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
658
659 for (auto VT : { MVT::f32, MVT::f64 }) {
660 // Use ANDPD to simulate FABS.
662
663 // Use XORP to simulate FNEG.
665
666 // Use ANDPD and ORPD to simulate FCOPYSIGN.
668
669 // These might be better off as horizontal vector ops.
672
673 // We don't support sin/cos/fmod
677 }
678
679 // Half type will be promoted by default.
680 setF16Action(MVT::f16, Promote);
691
724
725 // Lower this to MOVMSK plus an AND.
728
729 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
730 (UseX87 || Is64Bit)) {
731 // Use SSE for f32, x87 for f64.
732 // Set up the FP register classes.
733 addRegisterClass(MVT::f32, &X86::FR32RegClass);
734 if (UseX87)
735 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
736
737 // Use ANDPS to simulate FABS.
739
740 // Use XORP to simulate FNEG.
742
743 if (UseX87)
745
746 // Use ANDPS and ORPS to simulate FCOPYSIGN.
747 if (UseX87)
750
751 // We don't support sin/cos/fmod
755
756 if (UseX87) {
757 // Always expand sin/cos functions even though x87 has an instruction.
761 }
762 } else if (UseX87) {
763 // f32 and f64 in x87.
764 // Set up the FP register classes.
765 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
766 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
767
768 for (auto VT : { MVT::f32, MVT::f64 }) {
771
772 // Always expand sin/cos functions even though x87 has an instruction.
776 }
777 }
778
779 // Expand FP32 immediates into loads from the stack, save special cases.
780 if (isTypeLegal(MVT::f32)) {
781 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
782 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
783 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
784 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
785 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
786 } else // SSE immediates.
787 addLegalFPImmediate(APFloat(+0.0f)); // xorps
788 }
789 // Expand FP64 immediates into loads from the stack, save special cases.
790 if (isTypeLegal(MVT::f64)) {
791 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
792 addLegalFPImmediate(APFloat(+0.0)); // FLD0
793 addLegalFPImmediate(APFloat(+1.0)); // FLD1
794 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
795 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
796 } else // SSE immediates.
797 addLegalFPImmediate(APFloat(+0.0)); // xorpd
798 }
799 // Support fp16 0 immediate.
800 if (isTypeLegal(MVT::f16))
801 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
802
803 // Handle constrained floating-point operations of scalar.
816
817 // We don't support FMA.
820
821 // f80 always uses X87.
822 if (UseX87) {
823 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
826 {
828 addLegalFPImmediate(TmpFlt); // FLD0
829 TmpFlt.changeSign();
830 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
831
832 bool ignored;
833 APFloat TmpFlt2(+1.0);
835 &ignored);
836 addLegalFPImmediate(TmpFlt2); // FLD1
837 TmpFlt2.changeSign();
838 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
839 }
840
841 // Always expand sin/cos functions even though x87 has an instruction.
842 // clang-format off
854 // clang-format on
855
867
868 // Handle constrained floating-point operations of scalar.
875 if (isTypeLegal(MVT::f16)) {
878 } else {
880 }
881 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
882 // as Custom.
884 }
885
886 // f128 uses xmm registers, but most operations require libcalls.
887 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
888 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
889 : &X86::VR128RegClass);
890
891 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
892
903
907
908 // clang-format off
916 // clang-format on
917 // No STRICT_FSINCOS
920
923 // We need to custom handle any FP_ROUND with an f128 input, but
924 // LegalizeDAG uses the result type to know when to run a custom handler.
925 // So we have to list all legal floating point result types here.
926 if (isTypeLegal(MVT::f32)) {
929 }
930 if (isTypeLegal(MVT::f64)) {
933 }
934 if (isTypeLegal(MVT::f80)) {
938 }
939
941
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
943 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
944 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
946 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
947 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
948 }
949
950 // Always use a library call for pow.
951 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
953 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
954 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
955
964
965 // Some FP actions are always expanded for vector types.
966 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
967 MVT::v4f32, MVT::v8f32, MVT::v16f32,
968 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
969 // clang-format off
983 // clang-format on
984 }
985
986 // First set operation action for all vector types to either promote
987 // (for widening) or expand (for scalarization). Then we will selectively
988 // turn on ones that can be effectively codegen'd.
1028 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1029 setTruncStoreAction(InnerVT, VT, Expand);
1030
1031 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1032 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1033
1034 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1035 // types, we have to deal with them whether we ask for Expansion or not.
1036 // Setting Expand causes its own optimisation problems though, so leave
1037 // them legal.
1038 if (VT.getVectorElementType() == MVT::i1)
1039 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1040
1041 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1042 // split/scalarized right now.
1043 if (VT.getVectorElementType() == MVT::f16 ||
1044 VT.getVectorElementType() == MVT::bf16)
1045 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1046 }
1047 }
1048
1049 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1050 // with -msoft-float, disable use of MMX as well.
1051 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1052 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1053 // No operations on x86mmx supported, everything uses intrinsics.
1054 }
1055
1056 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1057 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1058 : &X86::VR128RegClass);
1059
1064
1065 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1066 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1074
1075 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1076 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1078
1084 }
1085
1086 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1087 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1088 : &X86::VR128RegClass);
1089
1090 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1091 // registers cannot be used even for integer operations.
1092 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1101 : &X86::VR128RegClass);
1102
1103 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1108 }
1109
1110 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1111 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1116 }
1117
1118 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1119 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1120 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1121
1122 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1123 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1128 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1129 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1130 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1131 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1134
1135 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1136 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1137 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1138
1139 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1141 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1143
1144 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1145 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1146
1147 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1148 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1149 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1150 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1151 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1152 }
1153
1164
1169
1170 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1176
1177 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1178 // setcc all the way to isel and prefer SETGT in some isel patterns.
1181 }
1182
1183 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1184 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1189
1190 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1196 }
1197
1198 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1202
1203 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1204 continue;
1205
1208 }
1209 setF16Action(MVT::v8f16, Expand);
1210 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1213 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1214 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1215 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1217
1218 // Custom lower v2i64 and v2f64 selects.
1225
1232
1233 // Custom legalize these to avoid over promotion or custom promotion.
1234 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1239 }
1240
1245
1248
1251
1252 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1257
1262
1263 // We want to legalize this to an f64 load rather than an i64 load on
1264 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1265 // store.
1266 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1267 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1268 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1269 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1270 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1272
1273 // Add 32-bit vector stores to help vectorization opportunities.
1274 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1276
1280 if (!Subtarget.hasAVX512())
1282
1286
1288
1305
1306 // In the customized shift lowering, the legal v4i32/v2i64 cases
1307 // in AVX2 will be recognized.
1308 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1312 if (VT == MVT::v2i64) continue;
1317 }
1318
1324 }
1325
1326 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1331
1332 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1334 }
1335 }
1336
1337 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1338 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1339 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1340 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1341
1342 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1345 }
1346
1347 // These might be better off as horizontal vector ops.
1352 }
1353
1354 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1355 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1358 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1362 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1368
1370 }
1371
1372 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1373 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1375 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1376 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1377 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1378 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1379 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1380
1384
1385 // FIXME: Do we need to handle scalar-to-vector here?
1386 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1387 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1388
1389 // We directly match byte blends in the backend as they match the VSELECT
1390 // condition form.
1392
1393 // SSE41 brings specific instructions for doing vector sign extend even in
1394 // cases where we don't have SRA.
1395 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1398 }
1399
1400 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1401 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1402 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1406 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1407 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1408 }
1409
1410 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1411 // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can
1412 // do the pre and post work in the vector domain.
1415 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1416 // so that DAG combine doesn't try to turn it into uint_to_fp.
1419 }
1420 }
1421
1422 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1424 }
1425
1426 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1427 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1428 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1431 }
1432
1433 // XOP can efficiently perform BITREVERSE with VPPERM.
1434 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1436 }
1437
1438 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1439 bool HasInt256 = Subtarget.hasInt256();
1440
1441 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1454 : &X86::VR256RegClass);
1455
1456 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1469
1471
1475
1481 }
1482
1483 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1484 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1485
1486 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1487 // even though v8i16 is a legal type.
1488 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1490 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1491 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1495
1502
1514
1515 if (!Subtarget.hasAVX512())
1517
1518 // In the customized shift lowering, the legal v8i32/v4i64 cases
1519 // in AVX2 will be recognized.
1520 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1526 if (VT == MVT::v4i64) continue;
1531 }
1532
1533 // These types need custom splitting if their input is a 128-bit vector.
1538
1542 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1543 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1546
1547 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1551 }
1552
1557
1558 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1563
1564 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1565 // setcc all the way to isel and prefer SETGT in some isel patterns.
1568 }
1569
1570 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1571 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1576
1577 if (Subtarget.hasAnyFMA()) {
1578 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1579 MVT::v2f64, MVT::v4f64 }) {
1582 }
1583 }
1584
1585 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1586 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1587 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1588 }
1589
1590 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1591 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1592 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1593 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1594
1595 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1596 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1597 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1598 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1600 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1601 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1602 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1603
1604 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1605 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1606
1607 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1610 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1611 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1612
1613 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1619 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1620 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1625
1626 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1627 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1632 }
1633
1634 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1637 }
1638
1639 if (HasInt256) {
1640 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1641 // when we have a 256bit-wide blend with immediate.
1644
1645 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1646 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1647 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1651 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1652 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1653 }
1654 }
1655
1656 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1657 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1658 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1660 }
1661
1662 // Extract subvector is special because the value type
1663 // (result) is 128-bit but the source is 256-bit wide.
1664 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1665 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1667 }
1668
1669 // Custom lower several nodes for 256-bit types.
1670 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1671 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1681 }
1682 setF16Action(MVT::v16f16, Expand);
1683 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1686 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1688 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1689 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1690
1691 if (HasInt256) {
1693
1694 // Custom legalize 2x32 to get a little better code.
1697
1698 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1699 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1701 }
1702 }
1703
1704 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1705 Subtarget.hasF16C()) {
1706 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1709 }
1710 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1713 }
1714 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1715 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1716 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1717 }
1718 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1719 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1720 }
1721
1722 // This block controls legalization of the mask vector sizes that are
1723 // available with AVX512. 512-bit vectors are in a separate block controlled
1724 // by useAVX512Regs.
1725 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1726 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1727 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1728 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1729 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1730 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1731
1735
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1738 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1739 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1742 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1743 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1751
1752 // There is no byte sized k-register load or store without AVX512DQ.
1753 if (!Subtarget.hasDQI()) {
1754 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1756 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1757 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1758
1763 }
1764
1765 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1766 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1770 }
1771
1772 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1774
1775 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1779
1786 }
1787
1788 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1790 }
1791 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1792 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1795 }
1796 }
1797
1798 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1799 // elements. 512-bits can be disabled based on prefer-vector-width and
1800 // required-vector-width function attributes.
1801 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1802 bool HasBWI = Subtarget.hasBWI();
1803
1804 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1809 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1810 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1811
1812 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1813 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1816 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1817 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1818 if (HasBWI)
1819 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1820 }
1821
1822 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1833 }
1834 setOperationAction(ISD::LRINT, MVT::v16f32,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 setOperationAction(ISD::LRINT, MVT::v8f64,
1837 Subtarget.hasDQI() ? Legal : Custom);
1838 if (Subtarget.hasDQI())
1839 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1840
1841 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1846 }
1847
1848 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1853 }
1854
1861
1873
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1875 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1876 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1877 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1878 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1879 if (HasBWI)
1880 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1881
1882 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1883 // to 512-bit rather than use the AVX2 instructions so that we can use
1884 // k-masks.
1885 if (!Subtarget.hasVLX()) {
1886 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1887 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1890 }
1891 }
1892
1894 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1895 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1905
1906 if (HasBWI) {
1907 // Extends from v64i1 masks to 512-bit vectors.
1911 }
1912
1913 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1926
1928 }
1929
1930 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1933 }
1934
1935 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1937 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1938 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1939
1940 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1941 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1942 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1943 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1944
1945 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1946 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1947 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1948 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1950 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1951 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1952 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1953
1954 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1955 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1956
1957 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1967
1968 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1969 // setcc all the way to isel and prefer SETGT in some isel patterns.
1972 }
1973
1974 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1975 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1980
1981 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1988 }
1989
1990 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1991 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1992 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1994 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1996 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1997 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2002 }
2003
2004 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2008 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2009 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2010
2011 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2015 setOperationAction(Opc, MVT::v8i64, Custom);
2016
2017 if (Subtarget.hasDQI())
2018 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2019
2020 if (Subtarget.hasCDI()) {
2021 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2022 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2024 }
2025 } // Subtarget.hasCDI()
2026
2027 if (Subtarget.hasVPOPCNTDQ()) {
2028 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2030 }
2031
2032 // Extract subvector is special because the value type
2033 // (result) is 256-bit but the source is 512-bit wide.
2034 // 128-bit was made Legal under AVX1.
2035 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2036 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2038
2039 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2040 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2050 }
2051 setF16Action(MVT::v32f16, Expand);
2056 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2057 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2058 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2059
2060 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2065 }
2066 if (HasBWI) {
2067 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2070 }
2071 } else {
2072 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2073 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2074 }
2075
2076 if (Subtarget.hasVBMI2()) {
2077 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2080 }
2081
2082 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2083 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2084 }
2085
2086 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2087 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2089 }// useAVX512Regs
2090
2091 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2092 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2093 MVT::v4i64}) {
2096 }
2097 }
2098
2099 // This block controls legalization for operations that don't have
2100 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2101 // narrower widths.
2102 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2103 // These operations are handled on non-VLX by artificially widening in
2104 // isel patterns.
2105
2109
2110 if (Subtarget.hasDQI()) {
2111 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2112 // v2f32 UINT_TO_FP is already custom under SSE2.
2115 "Unexpected operation action!");
2116 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2121 }
2122
2123 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2129 }
2130
2131 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2134 }
2135
2136 // Custom legalize 2x32 to get a little better code.
2139
2140 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2141 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2143
2144 if (Subtarget.hasDQI()) {
2148 setOperationAction(Opc, MVT::v2i64, Custom);
2149 setOperationAction(Opc, MVT::v4i64, Custom);
2150 }
2151 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2152 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2153 }
2154
2155 if (Subtarget.hasCDI()) {
2156 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2158 }
2159 } // Subtarget.hasCDI()
2160
2161 if (Subtarget.hasVPOPCNTDQ()) {
2162 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2164 }
2165
2166 // We can try to convert vectors to different sizes to leverage legal
2167 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2168 // then specialize to Legal below.
2169 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2170 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2171 MVT::v16i16, MVT::v8i8})
2173
2174 // Legal vpcompress depends on various AVX512 extensions.
2175 // Legal in AVX512F
2176 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2178
2179 // Legal in AVX512F + AVX512VL
2180 if (Subtarget.hasVLX())
2181 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2182 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2184
2185 // Legal in AVX512F + AVX512VBMI2
2186 if (Subtarget.hasVBMI2())
2187 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2189
2190 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2191 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2192 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2194 }
2195
2196 // This block controls legalization of v32i1/v64i1 which are available with
2197 // AVX512BW.
2198 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2199 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2200 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2201
2202 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2213 }
2214
2215 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2217
2218 // Extends from v32i1 masks to 256-bit vectors.
2222
2223 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2224 MVT::v16f16, MVT::v8f16}) {
2225 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2226 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2227 }
2228
2229 // These operations are handled on non-VLX by artificially widening in
2230 // isel patterns.
2231 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2232
2233 if (Subtarget.hasBITALG()) {
2234 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2236 }
2237 }
2238
2239 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2240 auto setGroup = [&] (MVT VT) {
2251
2264
2266
2269
2275
2281
2285 };
2286
2287 // AVX512_FP16 scalar operations
2288 setGroup(MVT::f16);
2306
2309
2310 if (Subtarget.useAVX512Regs()) {
2311 setGroup(MVT::v32f16);
2317 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2324
2329 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2331 MVT::v32i16);
2332 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2334 MVT::v32i16);
2335 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2337 MVT::v32i16);
2338 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2340 MVT::v32i16);
2341
2345
2346 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2347 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2348
2353 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2354 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2355 }
2356
2361
2362 if (Subtarget.hasVLX()) {
2363 setGroup(MVT::v8f16);
2364 setGroup(MVT::v16f16);
2365
2376
2383
2384 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2387
2391
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2394 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2395 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2396
2397 // Need to custom widen these to prevent scalarization.
2398 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2399 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2400
2405
2410 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2411 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2412 }
2413 }
2414
2415 if (!Subtarget.useSoftFloat() &&
2416 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2417 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2418 : &X86::VR128RegClass);
2419 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2420 : &X86::VR256RegClass);
2421 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2422 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2423 // Set the operation action Custom to do the customization later.
2426 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2427 setF16Action(VT, Expand);
2428 if (!Subtarget.hasBF16())
2434 }
2435 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2436 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2437 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2438 }
2439 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2440 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2442 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2443 }
2444
2445 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2446 Subtarget.useAVX512Regs()) {
2447 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2448 setF16Action(MVT::v32bf16, Expand);
2449 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2450 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2451 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2453 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2457 }
2458
2459 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2460 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2472 }
2473 if (Subtarget.hasAVX10_2_512()) {
2474 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2475 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2476 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2477 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2478 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2479 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2480 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2481 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2482 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2485 }
2486 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2489 }
2490 }
2491
2492 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2493 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2494 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2495 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2496 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2497 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2498
2499 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2500 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2501 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2502 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2503 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2504
2505 if (Subtarget.hasBWI()) {
2506 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2507 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2508 }
2509
2510 if (Subtarget.hasFP16()) {
2511 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2520 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2529 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2534 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2539 }
2540 }
2541
2542 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2543 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2544 }
2545
2546 // We want to custom lower some of our intrinsics.
2550 if (!Subtarget.is64Bit()) {
2552 }
2553
2554 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2555 // handle type legalization for these operations here.
2556 //
2557 // FIXME: We really should do custom legalization for addition and
2558 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2559 // than generic legalization for 64-bit multiplication-with-overflow, though.
2560 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2561 if (VT == MVT::i64 && !Subtarget.is64Bit())
2562 continue;
2563 // Add/Sub/Mul with overflow operations are custom lowered.
2570
2571 // Support carry in as value rather than glue.
2577 }
2578
2579 // Combine sin / cos into _sincos_stret if it is available.
2580 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2581 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2584 }
2585
2586 if (Subtarget.isTargetWin64()) {
2587 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2588 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2589 setOperationAction(ISD::SREM, MVT::i128, Custom);
2590 setOperationAction(ISD::UREM, MVT::i128, Custom);
2599 }
2600
2601 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2602 // is. We should promote the value to 64-bits to solve this.
2603 // This is what the CRT headers do - `fmodf` is an inline header
2604 // function casting to f64 and calling `fmod`.
2605 if (Subtarget.is32Bit() &&
2606 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2607 // clang-format off
2608 for (ISD::NodeType Op :
2626 // TODO: Add ISD:::STRICT_FMODF too once implemented.
2627 ISD::FMODF})
2628 if (isOperationExpand(Op, MVT::f32))
2629 setOperationAction(Op, MVT::f32, Promote);
2630 // clang-format on
2631
2632 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2633 // it, but it's just a wrapper around ldexp.
2634 if (Subtarget.isOSWindows()) {
2636 if (isOperationExpand(Op, MVT::f32))
2637 setOperationAction(Op, MVT::f32, Promote);
2638 }
2639
2640 // We have target-specific dag combine patterns for the following nodes:
2651 ISD::SHL,
2652 ISD::SRA,
2653 ISD::SRL,
2654 ISD::OR,
2655 ISD::AND,
2661 ISD::ADD,
2662 ISD::FADD,
2663 ISD::FSUB,
2664 ISD::FNEG,
2665 ISD::FMA,
2669 ISD::SUB,
2670 ISD::LOAD,
2671 ISD::LRINT,
2673 ISD::MLOAD,
2674 ISD::STORE,
2691 ISD::SETCC,
2692 ISD::MUL,
2693 ISD::XOR,
2704
2706
2707 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2709 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2711 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2713
2714 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2715 // that needs to benchmarked and balanced with the potential use of vector
2716 // load/store types (PR33329, PR33914).
2719
2720 // Default loop alignment, which can be overridden by -align-loops.
2722
2723 // An out-of-order CPU can speculatively execute past a predictable branch,
2724 // but a conditional move could be stalled by an expensive earlier operation.
2725 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2726 EnableExtLdPromotion = true;
2728
2730
2731 // Default to having -disable-strictnode-mutation on
2732 IsStrictFPEnabled = true;
2733}
2734
2735// This has so far only been implemented for 64-bit MachO.
2737 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2738}
2739
2741 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2742 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2743}
2744
2746 const SDLoc &DL) const {
2747 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2748 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2749 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2750 return SDValue(Node, 0);
2751}
2752
2755 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2756 !Subtarget.hasBWI())
2757 return TypeSplitVector;
2758
2759 // Since v8f16 is legal, widen anything over v4f16.
2760 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2761 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2762 VT.getVectorElementType() == MVT::f16)
2763 return TypeSplitVector;
2764
2765 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2766 VT.getVectorElementType() != MVT::i1)
2767 return TypeWidenVector;
2768
2770}
2771
2772FastISel *
2774 const TargetLibraryInfo *libInfo) const {
2775 return X86::createFastISel(funcInfo, libInfo);
2776}
2777
2778//===----------------------------------------------------------------------===//
2779// Other Lowering Hooks
2780//===----------------------------------------------------------------------===//
2781
2783 bool AssumeSingleUse) {
2784 if (!AssumeSingleUse && !Op.hasOneUse())
2785 return false;
2786 if (!ISD::isNormalLoad(Op.getNode()))
2787 return false;
2788
2789 // If this is an unaligned vector, make sure the target supports folding it.
2790 auto *Ld = cast<LoadSDNode>(Op.getNode());
2791 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2792 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2793 return false;
2794
2795 // TODO: If this is a non-temporal load and the target has an instruction
2796 // for it, it should not be folded. See "useNonTemporalLoad()".
2797
2798 return true;
2799}
2800
2802 const X86Subtarget &Subtarget,
2803 bool AssumeSingleUse) {
2804 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2805 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2806 return false;
2807
2808 // We can not replace a wide volatile load with a broadcast-from-memory,
2809 // because that would narrow the load, which isn't legal for volatiles.
2810 auto *Ld = cast<LoadSDNode>(Op.getNode());
2811 return !Ld->isVolatile() ||
2812 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2813}
2814
2816 if (!Op.hasOneUse())
2817 return false;
2818 // Peek through (oneuse) bitcast users
2819 SDNode *User = *Op->user_begin();
2820 while (User->getOpcode() == ISD::BITCAST) {
2821 if (!User->hasOneUse())
2822 return false;
2823 User = *User->user_begin();
2824 }
2825 return ISD::isNormalStore(User);
2826}
2827
2829 if (Op.hasOneUse()) {
2830 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2831 return (ISD::ZERO_EXTEND == Opcode);
2832 }
2833 return false;
2834}
2835
2836static bool isLogicOp(unsigned Opcode) {
2837 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2838 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2839}
2840
2841static bool isTargetShuffle(unsigned Opcode) {
2842 switch(Opcode) {
2843 default: return false;
2844 case X86ISD::BLENDI:
2845 case X86ISD::PSHUFB:
2846 case X86ISD::PSHUFD:
2847 case X86ISD::PSHUFHW:
2848 case X86ISD::PSHUFLW:
2849 case X86ISD::SHUFP:
2850 case X86ISD::INSERTPS:
2851 case X86ISD::EXTRQI:
2852 case X86ISD::INSERTQI:
2853 case X86ISD::VALIGN:
2854 case X86ISD::PALIGNR:
2855 case X86ISD::VSHLDQ:
2856 case X86ISD::VSRLDQ:
2857 case X86ISD::MOVLHPS:
2858 case X86ISD::MOVHLPS:
2859 case X86ISD::MOVSHDUP:
2860 case X86ISD::MOVSLDUP:
2861 case X86ISD::MOVDDUP:
2862 case X86ISD::MOVSS:
2863 case X86ISD::MOVSD:
2864 case X86ISD::MOVSH:
2865 case X86ISD::UNPCKL:
2866 case X86ISD::UNPCKH:
2867 case X86ISD::VBROADCAST:
2868 case X86ISD::VPERMILPI:
2869 case X86ISD::VPERMILPV:
2870 case X86ISD::VPERM2X128:
2871 case X86ISD::SHUF128:
2872 case X86ISD::VPERMIL2:
2873 case X86ISD::VPERMI:
2874 case X86ISD::VPPERM:
2875 case X86ISD::VPERMV:
2876 case X86ISD::VPERMV3:
2877 case X86ISD::VZEXT_MOVL:
2878 return true;
2879 }
2880}
2881
2882static bool isTargetShuffleVariableMask(unsigned Opcode) {
2883 switch (Opcode) {
2884 default: return false;
2885 // Target Shuffles.
2886 case X86ISD::PSHUFB:
2887 case X86ISD::VPERMILPV:
2888 case X86ISD::VPERMIL2:
2889 case X86ISD::VPPERM:
2890 case X86ISD::VPERMV:
2891 case X86ISD::VPERMV3:
2892 return true;
2893 // 'Faux' Target Shuffles.
2894 case ISD::OR:
2895 case ISD::AND:
2896 case X86ISD::ANDNP:
2897 return true;
2898 }
2899}
2900
2903 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2905 int ReturnAddrIndex = FuncInfo->getRAIndex();
2906
2907 if (ReturnAddrIndex == 0) {
2908 // Set up a frame object for the return address.
2909 unsigned SlotSize = RegInfo->getSlotSize();
2910 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2911 -(int64_t)SlotSize,
2912 false);
2913 FuncInfo->setRAIndex(ReturnAddrIndex);
2914 }
2915
2916 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2917}
2918
2920 bool HasSymbolicDisplacement) {
2921 // Offset should fit into 32 bit immediate field.
2922 if (!isInt<32>(Offset))
2923 return false;
2924
2925 // If we don't have a symbolic displacement - we don't have any extra
2926 // restrictions.
2927 if (!HasSymbolicDisplacement)
2928 return true;
2929
2930 // We can fold large offsets in the large code model because we always use
2931 // 64-bit offsets.
2932 if (CM == CodeModel::Large)
2933 return true;
2934
2935 // For kernel code model we know that all object resist in the negative half
2936 // of 32bits address space. We may not accept negative offsets, since they may
2937 // be just off and we may accept pretty large positive ones.
2938 if (CM == CodeModel::Kernel)
2939 return Offset >= 0;
2940
2941 // For other non-large code models we assume that latest small object is 16MB
2942 // before end of 31 bits boundary. We may also accept pretty large negative
2943 // constants knowing that all objects are in the positive half of address
2944 // space.
2945 return Offset < 16 * 1024 * 1024;
2946}
2947
2948/// Return true if the condition is an signed comparison operation.
2949static bool isX86CCSigned(X86::CondCode X86CC) {
2950 switch (X86CC) {
2951 default:
2952 llvm_unreachable("Invalid integer condition!");
2953 case X86::COND_E:
2954 case X86::COND_NE:
2955 case X86::COND_B:
2956 case X86::COND_A:
2957 case X86::COND_BE:
2958 case X86::COND_AE:
2959 return false;
2960 case X86::COND_G:
2961 case X86::COND_GE:
2962 case X86::COND_L:
2963 case X86::COND_LE:
2964 return true;
2965 }
2966}
2967
2969 switch (SetCCOpcode) {
2970 // clang-format off
2971 default: llvm_unreachable("Invalid integer condition!");
2972 case ISD::SETEQ: return X86::COND_E;
2973 case ISD::SETGT: return X86::COND_G;
2974 case ISD::SETGE: return X86::COND_GE;
2975 case ISD::SETLT: return X86::COND_L;
2976 case ISD::SETLE: return X86::COND_LE;
2977 case ISD::SETNE: return X86::COND_NE;
2978 case ISD::SETULT: return X86::COND_B;
2979 case ISD::SETUGT: return X86::COND_A;
2980 case ISD::SETULE: return X86::COND_BE;
2981 case ISD::SETUGE: return X86::COND_AE;
2982 // clang-format on
2983 }
2984}
2985
2986/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2987/// condition code, returning the condition code and the LHS/RHS of the
2988/// comparison to make.
2990 bool isFP, SDValue &LHS, SDValue &RHS,
2991 SelectionDAG &DAG) {
2992 if (!isFP) {
2993 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2994 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2995 // X > -1 -> X == 0, jump !sign.
2996 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2997 return X86::COND_NS;
2998 }
2999 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
3000 // X < 0 -> X == 0, jump on sign.
3001 return X86::COND_S;
3002 }
3003 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3004 // X >= 0 -> X == 0, jump on !sign.
3005 return X86::COND_NS;
3006 }
3007 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3008 // X < 1 -> X <= 0
3009 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3010 return X86::COND_LE;
3011 }
3012 }
3013
3014 return TranslateIntegerX86CC(SetCCOpcode);
3015 }
3016
3017 // First determine if it is required or is profitable to flip the operands.
3018
3019 // If LHS is a foldable load, but RHS is not, flip the condition.
3020 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3021 !ISD::isNON_EXTLoad(RHS.getNode())) {
3022 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3023 std::swap(LHS, RHS);
3024 }
3025
3026 switch (SetCCOpcode) {
3027 default: break;
3028 case ISD::SETOLT:
3029 case ISD::SETOLE:
3030 case ISD::SETUGT:
3031 case ISD::SETUGE:
3032 std::swap(LHS, RHS);
3033 break;
3034 }
3035
3036 // On a floating point condition, the flags are set as follows:
3037 // ZF PF CF op
3038 // 0 | 0 | 0 | X > Y
3039 // 0 | 0 | 1 | X < Y
3040 // 1 | 0 | 0 | X == Y
3041 // 1 | 1 | 1 | unordered
3042 switch (SetCCOpcode) {
3043 // clang-format off
3044 default: llvm_unreachable("Condcode should be pre-legalized away");
3045 case ISD::SETUEQ:
3046 case ISD::SETEQ: return X86::COND_E;
3047 case ISD::SETOLT: // flipped
3048 case ISD::SETOGT:
3049 case ISD::SETGT: return X86::COND_A;
3050 case ISD::SETOLE: // flipped
3051 case ISD::SETOGE:
3052 case ISD::SETGE: return X86::COND_AE;
3053 case ISD::SETUGT: // flipped
3054 case ISD::SETULT:
3055 case ISD::SETLT: return X86::COND_B;
3056 case ISD::SETUGE: // flipped
3057 case ISD::SETULE:
3058 case ISD::SETLE: return X86::COND_BE;
3059 case ISD::SETONE:
3060 case ISD::SETNE: return X86::COND_NE;
3061 case ISD::SETUO: return X86::COND_P;
3062 case ISD::SETO: return X86::COND_NP;
3063 case ISD::SETOEQ:
3064 case ISD::SETUNE: return X86::COND_INVALID;
3065 // clang-format on
3066 }
3067}
3068
3069/// Is there a floating point cmov for the specific X86 condition code?
3070/// Current x86 isa includes the following FP cmov instructions:
3071/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3072static bool hasFPCMov(unsigned X86CC) {
3073 switch (X86CC) {
3074 default:
3075 return false;
3076 case X86::COND_B:
3077 case X86::COND_BE:
3078 case X86::COND_E:
3079 case X86::COND_P:
3080 case X86::COND_A:
3081 case X86::COND_AE:
3082 case X86::COND_NE:
3083 case X86::COND_NP:
3084 return true;
3085 }
3086}
3087
3088static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3089 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3090 VT.is512BitVector();
3091}
3092
3094 const CallInst &I,
3095 MachineFunction &MF,
3096 unsigned Intrinsic) const {
3098 Info.offset = 0;
3099
3100 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3101 if (!IntrData) {
3102 switch (Intrinsic) {
3103 case Intrinsic::x86_aesenc128kl:
3104 case Intrinsic::x86_aesdec128kl:
3106 Info.ptrVal = I.getArgOperand(1);
3107 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3108 Info.align = Align(1);
3110 return true;
3111 case Intrinsic::x86_aesenc256kl:
3112 case Intrinsic::x86_aesdec256kl:
3114 Info.ptrVal = I.getArgOperand(1);
3115 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3116 Info.align = Align(1);
3118 return true;
3119 case Intrinsic::x86_aesencwide128kl:
3120 case Intrinsic::x86_aesdecwide128kl:
3122 Info.ptrVal = I.getArgOperand(0);
3123 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3124 Info.align = Align(1);
3126 return true;
3127 case Intrinsic::x86_aesencwide256kl:
3128 case Intrinsic::x86_aesdecwide256kl:
3130 Info.ptrVal = I.getArgOperand(0);
3131 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3132 Info.align = Align(1);
3134 return true;
3135 case Intrinsic::x86_cmpccxadd32:
3136 case Intrinsic::x86_cmpccxadd64:
3137 case Intrinsic::x86_atomic_bts:
3138 case Intrinsic::x86_atomic_btc:
3139 case Intrinsic::x86_atomic_btr: {
3141 Info.ptrVal = I.getArgOperand(0);
3142 unsigned Size = I.getType()->getScalarSizeInBits();
3143 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3144 Info.align = Align(Size);
3147 return true;
3148 }
3149 case Intrinsic::x86_atomic_bts_rm:
3150 case Intrinsic::x86_atomic_btc_rm:
3151 case Intrinsic::x86_atomic_btr_rm: {
3153 Info.ptrVal = I.getArgOperand(0);
3154 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3155 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3156 Info.align = Align(Size);
3159 return true;
3160 }
3161 case Intrinsic::x86_aadd32:
3162 case Intrinsic::x86_aadd64:
3163 case Intrinsic::x86_aand32:
3164 case Intrinsic::x86_aand64:
3165 case Intrinsic::x86_aor32:
3166 case Intrinsic::x86_aor64:
3167 case Intrinsic::x86_axor32:
3168 case Intrinsic::x86_axor64:
3169 case Intrinsic::x86_atomic_add_cc:
3170 case Intrinsic::x86_atomic_sub_cc:
3171 case Intrinsic::x86_atomic_or_cc:
3172 case Intrinsic::x86_atomic_and_cc:
3173 case Intrinsic::x86_atomic_xor_cc: {
3175 Info.ptrVal = I.getArgOperand(0);
3176 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3177 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3178 Info.align = Align(Size);
3181 return true;
3182 }
3183 }
3184 return false;
3185 }
3186
3187 switch (IntrData->Type) {
3190 case TRUNCATE_TO_MEM_VI32: {
3192 Info.ptrVal = I.getArgOperand(0);
3193 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3195 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3196 ScalarVT = MVT::i8;
3197 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3198 ScalarVT = MVT::i16;
3199 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3200 ScalarVT = MVT::i32;
3201
3202 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3203 Info.align = Align(1);
3205 break;
3206 }
3207 case GATHER:
3208 case GATHER_AVX2: {
3210 Info.ptrVal = nullptr;
3211 MVT DataVT = MVT::getVT(I.getType());
3212 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3213 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3214 IndexVT.getVectorNumElements());
3215 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3216 Info.align = Align(1);
3218 break;
3219 }
3220 case SCATTER: {
3222 Info.ptrVal = nullptr;
3223 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3224 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3225 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3226 IndexVT.getVectorNumElements());
3227 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3228 Info.align = Align(1);
3230 break;
3231 }
3232 default:
3233 return false;
3234 }
3235
3236 return true;
3237}
3238
3239/// Returns true if the target can instruction select the
3240/// specified FP immediate natively. If false, the legalizer will
3241/// materialize the FP immediate as a load from a constant pool.
3243 bool ForCodeSize) const {
3244 for (const APFloat &FPImm : LegalFPImmediates)
3245 if (Imm.bitwiseIsEqual(FPImm))
3246 return true;
3247 return false;
3248}
3249
3251 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3252 std::optional<unsigned> ByteOffset) const {
3253 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3254
3255 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3256 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3257 N = *N->user_begin();
3258 return N;
3259 };
3260
3261 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3262 // relocation target a movq or addq instruction: don't let the load shrink.
3263 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3264 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3265 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3266 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3267
3268 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3269 // those uses are extracted directly into a store, then the extract + store
3270 // can be store-folded, or (4) any use will be used by legal full width
3271 // instruction. Then, it's probably not worth splitting the load.
3272 EVT VT = Load->getValueType(0);
3273 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3274 !SDValue(Load, 0).hasOneUse()) {
3275 bool FullWidthUse = false;
3276 bool AllExtractStores = true;
3277 for (SDUse &Use : Load->uses()) {
3278 // Skip uses of the chain value. Result 0 of the node is the load value.
3279 if (Use.getResNo() != 0)
3280 continue;
3281
3282 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3283
3284 // If this use is an extract + store, it's probably not worth splitting.
3285 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3286 all_of(User->uses(), [&](const SDUse &U) {
3287 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3288 return Inner->getOpcode() == ISD::STORE;
3289 }))
3290 continue;
3291
3292 AllExtractStores = false;
3293
3294 // If any use is a full width legal/target bin op, then assume its legal
3295 // and won't split.
3296 if (isBinOp(User->getOpcode()) &&
3297 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3298 User->getOpcode() > ISD::BUILTIN_OP_END))
3299 FullWidthUse = true;
3300 }
3301
3302 if (AllExtractStores)
3303 return false;
3304
3305 // If we have an user that uses the full vector width, then this use is
3306 // only worth splitting if the offset isn't 0 (to avoid an
3307 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3308 if (FullWidthUse)
3309 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3310 }
3311
3312 return true;
3313}
3314
3315/// Returns true if it is beneficial to convert a load of a constant
3316/// to just the constant itself.
3318 Type *Ty) const {
3319 assert(Ty->isIntegerTy());
3320
3321 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3322 if (BitSize == 0 || BitSize > 64)
3323 return false;
3324 return true;
3325}
3326
3328 // If we are using XMM registers in the ABI and the condition of the select is
3329 // a floating-point compare and we have blendv or conditional move, then it is
3330 // cheaper to select instead of doing a cross-register move and creating a
3331 // load that depends on the compare result.
3332 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3333 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3334}
3335
3337 // TODO: It might be a win to ease or lift this restriction, but the generic
3338 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3339 if (VT.isVector() && Subtarget.hasAVX512())
3340 return false;
3341
3342 return true;
3343}
3344
3346 SDValue C) const {
3347 // TODO: We handle scalars using custom code, but generic combining could make
3348 // that unnecessary.
3349 APInt MulC;
3350 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3351 return false;
3352
3353 // Find the type this will be legalized too. Otherwise we might prematurely
3354 // convert this to shl+add/sub and then still have to type legalize those ops.
3355 // Another choice would be to defer the decision for illegal types until
3356 // after type legalization. But constant splat vectors of i64 can't make it
3357 // through type legalization on 32-bit targets so we would need to special
3358 // case vXi64.
3359 while (getTypeAction(Context, VT) != TypeLegal)
3360 VT = getTypeToTransformTo(Context, VT);
3361
3362 // If vector multiply is legal, assume that's faster than shl + add/sub.
3363 // Multiply is a complex op with higher latency and lower throughput in
3364 // most implementations, sub-vXi32 vector multiplies are always fast,
3365 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3366 // is always going to be slow.
3367 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3368 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3369 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3370 return false;
3371
3372 // shl+add, shl+sub, shl+add+neg
3373 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3374 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3375}
3376
3378 unsigned Index) const {
3380 return false;
3381
3382 // Mask vectors support all subregister combinations and operations that
3383 // extract half of vector.
3384 if (ResVT.getVectorElementType() == MVT::i1)
3385 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3386 (Index == ResVT.getVectorNumElements()));
3387
3388 return (Index % ResVT.getVectorNumElements()) == 0;
3389}
3390
3392 unsigned Opc = VecOp.getOpcode();
3393
3394 // Assume target opcodes can't be scalarized.
3395 // TODO - do we have any exceptions?
3396 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3397 return false;
3398
3399 // If the vector op is not supported, try to convert to scalar.
3400 EVT VecVT = VecOp.getValueType();
3402 return true;
3403
3404 // If the vector op is supported, but the scalar op is not, the transform may
3405 // not be worthwhile.
3406 EVT ScalarVT = VecVT.getScalarType();
3407 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3408}
3409
3411 bool) const {
3412 // TODO: Allow vectors?
3413 if (VT.isVector())
3414 return false;
3415 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3416}
3417
3419 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3420 // i32/i64 or can rely on BSF passthrough value.
3421 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3422 Subtarget.hasBitScanPassThrough() ||
3423 (!Ty->isVectorTy() &&
3424 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3425}
3426
3428 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3429 // passthrough value.
3430 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3431 Subtarget.hasBitScanPassThrough();
3432}
3433
3435 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3436 // expensive than a straight movsd. On the other hand, it's important to
3437 // shrink long double fp constant since fldt is very slow.
3438 return !Subtarget.hasSSE2() || VT == MVT::f80;
3439}
3440
3442 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3443 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3444}
3445
3447 const SelectionDAG &DAG,
3448 const MachineMemOperand &MMO) const {
3449 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3450 BitcastVT.getVectorElementType() == MVT::i1)
3451 return false;
3452
3453 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3454 return false;
3455
3456 // If both types are legal vectors, it's always ok to convert them.
3457 if (LoadVT.isVector() && BitcastVT.isVector() &&
3458 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3459 return true;
3460
3461 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3462}
3463
3465 const MachineFunction &MF) const {
3466 // Do not merge to float value size (128 bytes) if no implicit
3467 // float attribute is set.
3468 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3469
3470 if (NoFloat) {
3471 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3472 return (MemVT.getSizeInBits() <= MaxIntSize);
3473 }
3474 // Make sure we don't merge greater than our preferred vector
3475 // width.
3476 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3477 return false;
3478
3479 return true;
3480}
3481
3483 return Subtarget.hasFastLZCNT();
3484}
3485
3487 const Instruction &AndI) const {
3488 return true;
3489}
3490
3492 EVT VT = Y.getValueType();
3493
3494 if (VT.isVector())
3495 return false;
3496
3497 if (!Subtarget.hasBMI())
3498 return false;
3499
3500 // There are only 32-bit and 64-bit forms for 'andn'.
3501 if (VT != MVT::i32 && VT != MVT::i64)
3502 return false;
3503
3504 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3505}
3506
3508 EVT VT = Y.getValueType();
3509
3510 if (!VT.isVector())
3511 return hasAndNotCompare(Y);
3512
3513 // Vector.
3514
3515 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3516 return false;
3517
3518 if (VT == MVT::v4i32)
3519 return true;
3520
3521 return Subtarget.hasSSE2();
3522}
3523
3525 return X.getValueType().isScalarInteger(); // 'bt'
3526}
3527
3531 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3532 SelectionDAG &DAG) const {
3533 // Does baseline recommend not to perform the fold by default?
3535 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3536 return false;
3537 // For scalars this transform is always beneficial.
3538 if (X.getValueType().isScalarInteger())
3539 return true;
3540 // If all the shift amounts are identical, then transform is beneficial even
3541 // with rudimentary SSE2 shifts.
3542 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3543 return true;
3544 // If we have AVX2 with it's powerful shift operations, then it's also good.
3545 if (Subtarget.hasAVX2())
3546 return true;
3547 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3548 return NewShiftOpcode == ISD::SHL;
3549}
3550
3552 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3553 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3554 if (!VT.isInteger())
3555 return ShiftOpc;
3556
3557 bool PreferRotate = false;
3558 if (VT.isVector()) {
3559 // For vectors, if we have rotate instruction support, then its definetly
3560 // best. Otherwise its not clear what the best so just don't make changed.
3561 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3562 VT.getScalarType() == MVT::i64);
3563 } else {
3564 // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer
3565 // rotate unless we have a zext mask+shr.
3566 PreferRotate = Subtarget.hasBMI2();
3567 if (!PreferRotate) {
3568 unsigned MaskBits =
3569 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3570 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3571 }
3572 }
3573
3574 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3575 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3576
3577 if (PreferRotate && MayTransformRotate)
3578 return ISD::ROTL;
3579
3580 // If vector we don't really get much benefit swapping around constants.
3581 // Maybe we could check if the DAG has the flipped node already in the
3582 // future.
3583 if (VT.isVector())
3584 return ShiftOpc;
3585
3586 // See if the beneficial to swap shift type.
3587 if (ShiftOpc == ISD::SHL) {
3588 // If the current setup has imm64 mask, then inverse will have
3589 // at least imm32 mask (or be zext i32 -> i64).
3590 if (VT == MVT::i64)
3591 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3592 : ShiftOpc;
3593
3594 // We can only benefit if req at least 7-bit for the mask. We
3595 // don't want to replace shl of 1,2,3 as they can be implemented
3596 // with lea/add.
3597 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3598 }
3599
3600 if (VT == MVT::i64)
3601 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3602 // extremely efficient.
3603 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3604
3605 // Keep small shifts as shl so we can generate add/lea.
3606 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3607 }
3608
3609 // We prefer rotate for vectors of if we won't get a zext mask with SRL
3610 // (PreferRotate will be set in the latter case).
3611 if (PreferRotate || !MayTransformRotate || VT.isVector())
3612 return ShiftOpc;
3613
3614 // Non-vector type and we have a zext mask with SRL.
3615 return ISD::SRL;
3616}
3617
3620 const Value *Lhs,
3621 const Value *Rhs) const {
3622 using namespace llvm::PatternMatch;
3623 int BaseCost = BrMergingBaseCostThresh.getValue();
3624 // With CCMP, branches can be merged in a more efficient way.
3625 if (BaseCost >= 0 && Subtarget.hasCCMP())
3626 BaseCost += BrMergingCcmpBias;
3627 // a == b && a == c is a fast pattern on x86.
3628 if (BaseCost >= 0 && Opc == Instruction::And &&
3631 BaseCost += 1;
3632 return {BaseCost, BrMergingLikelyBias.getValue(),
3633 BrMergingUnlikelyBias.getValue()};
3634}
3635
3637 return N->getOpcode() != ISD::FP_EXTEND;
3638}
3639
3641 const SDNode *N, CombineLevel Level) const {
3642 assert(((N->getOpcode() == ISD::SHL &&
3643 N->getOperand(0).getOpcode() == ISD::SRL) ||
3644 (N->getOpcode() == ISD::SRL &&
3645 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3646 "Expected shift-shift mask");
3647 // TODO: Should we always create i64 masks? Or only folded immediates?
3648 EVT VT = N->getValueType(0);
3649 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3650 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3651 // Only fold if the shift values are equal - so it folds to AND.
3652 // TODO - we should fold if either is a non-uniform vector but we don't do
3653 // the fold for non-splats yet.
3654 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3655 }
3657}
3658
3660 EVT VT = Y.getValueType();
3661
3662 // For vectors, we don't have a preference, but we probably want a mask.
3663 if (VT.isVector())
3664 return false;
3665
3666 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3667 if (VT == MVT::i64 && !Subtarget.is64Bit())
3668 return false;
3669
3670 return true;
3671}
3672
3675 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3677 !Subtarget.isOSWindows())
3680 ExpansionFactor);
3681}
3682
3684 // Any legal vector type can be splatted more efficiently than
3685 // loading/spilling from memory.
3686 return isTypeLegal(VT);
3687}
3688
3690 MVT VT = MVT::getIntegerVT(NumBits);
3691 if (isTypeLegal(VT))
3692 return VT;
3693
3694 // PMOVMSKB can handle this.
3695 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3696 return MVT::v16i8;
3697
3698 // VPMOVMSKB can handle this.
3699 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3700 return MVT::v32i8;
3701
3702 // TODO: Allow 64-bit type for 32-bit target.
3703 // TODO: 512-bit types should be allowed, but make sure that those
3704 // cases are handled in combineVectorSizedSetCCEquality().
3705
3707}
3708
3709/// Val is the undef sentinel value or equal to the specified value.
3710static bool isUndefOrEqual(int Val, int CmpVal) {
3711 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3712}
3713
3714/// Return true if every element in Mask is the undef sentinel value or equal to
3715/// the specified value.
3716static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3717 return llvm::all_of(Mask, [CmpVal](int M) {
3718 return (M == SM_SentinelUndef) || (M == CmpVal);
3719 });
3720}
3721
3722/// Return true if every element in Mask, beginning from position Pos and ending
3723/// in Pos+Size is the undef sentinel value or equal to the specified value.
3724static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3725 unsigned Size) {
3726 return llvm::all_of(Mask.slice(Pos, Size),
3727 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3728}
3729
3730/// Val is either the undef or zero sentinel value.
3731static bool isUndefOrZero(int Val) {
3732 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3733}
3734
3735/// Return true if every element in Mask, beginning from position Pos and ending
3736/// in Pos+Size is the undef sentinel value.
3737static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3738 return llvm::all_of(Mask.slice(Pos, Size),
3739 [](int M) { return M == SM_SentinelUndef; });
3740}
3741
3742/// Return true if the mask creates a vector whose lower half is undefined.
3744 unsigned NumElts = Mask.size();
3745 return isUndefInRange(Mask, 0, NumElts / 2);
3746}
3747
3748/// Return true if the mask creates a vector whose upper half is undefined.
3750 unsigned NumElts = Mask.size();
3751 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3752}
3753
/// Return true if Val falls within the half-open range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}
3758
3759/// Return true if the value of any element in Mask falls within the specified
3760/// range (L, H].
3761static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3762 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3763}
3764
3765/// Return true if the value of any element in Mask is the zero sentinel value.
3766static bool isAnyZero(ArrayRef<int> Mask) {
3767 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3768}
3769
3770/// Return true if Val is undef or if its value falls within the
3771/// specified range (L, H].
3772static bool isUndefOrInRange(int Val, int Low, int Hi) {
3773 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3774}
3775
3776/// Return true if every element in Mask is undef or if its value
3777/// falls within the specified range (L, H].
3778static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3779 return llvm::all_of(
3780 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3781}
3782
3783/// Return true if Val is undef, zero or if its value falls within the
3784/// specified range (L, H].
3785static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3786 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3787}
3788
3789/// Return true if every element in Mask is undef, zero or if its value
3790/// falls within the specified range (L, H].
3791static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3792 return llvm::all_of(
3793 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3794}
3795
3796/// Return true if every element in Mask, is an in-place blend/select mask or is
3797/// undef.
3799 unsigned NumElts = Mask.size();
3800 for (auto [I, M] : enumerate(Mask))
3801 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3802 return false;
3803 return true;
3804}
3805
3806/// Return true if every element in Mask, beginning
3807/// from position Pos and ending in Pos + Size, falls within the specified
3808/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3809static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3810 unsigned Size, int Low, int Step = 1) {
3811 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3812 if (!isUndefOrEqual(Mask[i], Low))
3813 return false;
3814 return true;
3815}
3816
3817/// Return true if every element in Mask, beginning
3818/// from position Pos and ending in Pos+Size, falls within the specified
3819/// sequential range (Low, Low+Size], or is undef or is zero.
3821 unsigned Size, int Low,
3822 int Step = 1) {
3823 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3824 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3825 return false;
3826 return true;
3827}
3828
3829/// Return true if every element in Mask, beginning
3830/// from position Pos and ending in Pos+Size is undef or is zero.
3831static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3832 unsigned Size) {
3833 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3834}
3835
3836/// Return true if every element of a single input is referenced by the shuffle
3837/// mask. i.e. it just permutes them all.
3839 unsigned NumElts = Mask.size();
3840 APInt DemandedElts = APInt::getZero(NumElts);
3841 for (int M : Mask)
3842 if (isInRange(M, 0, NumElts))
3843 DemandedElts.setBit(M);
3844 return DemandedElts.isAllOnes();
3845}
3846
3847/// Helper function to test whether a shuffle mask could be
3848/// simplified by widening the elements being shuffled.
3849///
3850/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3851/// leaves it in an unspecified state.
3852///
3853/// NOTE: This must handle normal vector shuffle masks and *target* vector
3854/// shuffle masks. The latter have the special property of a '-2' representing
3855/// a zero-ed lane of a vector.
3857 SmallVectorImpl<int> &WidenedMask) {
3858 WidenedMask.assign(Mask.size() / 2, 0);
3859 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3860 int M0 = Mask[i];
3861 int M1 = Mask[i + 1];
3862
3863 // If both elements are undef, its trivial.
3864 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3865 WidenedMask[i / 2] = SM_SentinelUndef;
3866 continue;
3867 }
3868
3869 // Check for an undef mask and a mask value properly aligned to fit with
3870 // a pair of values. If we find such a case, use the non-undef mask's value.
3871 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3872 WidenedMask[i / 2] = M1 / 2;
3873 continue;
3874 }
3875 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3876 WidenedMask[i / 2] = M0 / 2;
3877 continue;
3878 }
3879
3880 // When zeroing, we need to spread the zeroing across both lanes to widen.
3881 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3882 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3884 WidenedMask[i / 2] = SM_SentinelZero;
3885 continue;
3886 }
3887 return false;
3888 }
3889
3890 // Finally check if the two mask values are adjacent and aligned with
3891 // a pair.
3892 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3893 WidenedMask[i / 2] = M0 / 2;
3894 continue;
3895 }
3896
3897 // Otherwise we can't safely widen the elements used in this shuffle.
3898 return false;
3899 }
3900 assert(WidenedMask.size() == Mask.size() / 2 &&
3901 "Incorrect size of mask after widening the elements!");
3902
3903 return true;
3904}
3905
3907 const APInt &Zeroable,
3908 bool V2IsZero,
3909 SmallVectorImpl<int> &WidenedMask) {
3910 // Create an alternative mask with info about zeroable elements.
3911 // Here we do not set undef elements as zeroable.
3912 SmallVector<int, 64> ZeroableMask(Mask);
3913 if (V2IsZero) {
3914 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3915 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3916 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3917 ZeroableMask[i] = SM_SentinelZero;
3918 }
3919 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3920}
3921
3923 SmallVector<int, 32> WidenedMask;
3924 return canWidenShuffleElements(Mask, WidenedMask);
3925}
3926
3927// Attempt to narrow/widen shuffle mask until it matches the target number of
3928// elements.
3929static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3930 SmallVectorImpl<int> &ScaledMask) {
3931 unsigned NumSrcElts = Mask.size();
3932 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3933 "Illegal shuffle scale factor");
3934
3935 // Narrowing is guaranteed to work.
3936 if (NumDstElts >= NumSrcElts) {
3937 int Scale = NumDstElts / NumSrcElts;
3938 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3939 return true;
3940 }
3941
3942 // We have to repeat the widening until we reach the target size, but we can
3943 // split out the first widening as it sets up ScaledMask for us.
3944 if (canWidenShuffleElements(Mask, ScaledMask)) {
3945 while (ScaledMask.size() > NumDstElts) {
3946 SmallVector<int, 16> WidenedMask;
3947 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3948 return false;
3949 ScaledMask = std::move(WidenedMask);
3950 }
3951 return true;
3952 }
3953
3954 return false;
3955}
3956
3957static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3958 SmallVector<int, 32> ScaledMask;
3959 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3960}
3961
3962// Helper to grow the shuffle mask for a larger value type.
3963// NOTE: This is different to scaleShuffleElements which is a same size type.
3964static void growShuffleMask(ArrayRef<int> SrcMask,
3965 SmallVectorImpl<int> &DstMask,
3966 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3967 assert(DstMask.empty() && "Expected an empty shuffle mas");
3968 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3969 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3970 unsigned NumSrcElts = SrcMask.size();
3971 DstMask.assign(SrcMask.begin(), SrcMask.end());
3972 for (int &M : DstMask) {
3973 if (M < 0)
3974 continue;
3975 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3976 }
3977 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3978}
3979
3980/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3982 return isNullConstant(Elt) || isNullFPConstant(Elt);
3983}
3984
3985// Build a vector of constants.
3986// Use an UNDEF node if MaskElt == -1.
3987// Split 64-bit constants in the 32-bit mode.
3989 const SDLoc &dl, bool IsMask = false) {
3990
3992 bool Split = false;
3993
3994 MVT ConstVecVT = VT;
3995 unsigned NumElts = VT.getVectorNumElements();
3996 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3997 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3998 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3999 Split = true;
4000 }
4001
4002 MVT EltVT = ConstVecVT.getVectorElementType();
4003 for (unsigned i = 0; i < NumElts; ++i) {
4004 bool IsUndef = Values[i] < 0 && IsMask;
4005 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(Values[i], dl, EltVT);
4007 Ops.push_back(OpNode);
4008 if (Split)
4009 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4010 DAG.getConstant(0, dl, EltVT));
4011 }
4012 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4013 if (Split)
4014 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4015 return ConstsNode;
4016}
4017
4018static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4019 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4020 assert(Bits.size() == Undefs.getBitWidth() &&
4021 "Unequal constant and undef arrays");
4023 bool Split = false;
4024
4025 MVT ConstVecVT = VT;
4026 unsigned NumElts = VT.getVectorNumElements();
4027 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4028 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4029 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4030 Split = true;
4031 }
4032
4033 MVT EltVT = ConstVecVT.getVectorElementType();
4034 MVT EltIntVT = EltVT.changeTypeToInteger();
4035 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4036 if (Undefs[i]) {
4037 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4038 continue;
4039 }
4040 const APInt &V = Bits[i];
4041 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4042 if (Split) {
4043 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4044 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4045 } else {
4046 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4047 }
4048 }
4049
4050 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4051 return DAG.getBitcast(VT, ConstsNode);
4052}
4053
4055 SelectionDAG &DAG, const SDLoc &dl) {
4056 APInt Undefs = APInt::getZero(Bits.size());
4057 return getConstVector(Bits, Undefs, VT, DAG, dl);
4058}
4059
4060/// Returns a vector of specified type with all zero elements.
4061static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4062 SelectionDAG &DAG, const SDLoc &dl) {
4063 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4064 VT.getVectorElementType() == MVT::i1) &&
4065 "Unexpected vector type");
4066
4067 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4068 // type. This ensures they get CSE'd. But if the integer type is not
4069 // available, use a floating-point +0.0 instead.
4070 SDValue Vec;
4071 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4072 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4073 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4074 } else if (VT.isFloatingPoint() &&
4076 Vec = DAG.getConstantFP(+0.0, dl, VT);
4077 } else if (VT.getVectorElementType() == MVT::i1) {
4078 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4079 "Unexpected vector type");
4080 Vec = DAG.getConstant(0, dl, VT);
4081 } else {
4082 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4083 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4084 }
4085 return DAG.getBitcast(VT, Vec);
4086}
4087
4088// Helper to determine if the ops are all the extracted subvectors come from a
4089// single source. If we allow commute they don't have to be in order (Lo/Hi).
4090static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4091 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4092 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4093 LHS.getValueType() != RHS.getValueType() ||
4094 LHS.getOperand(0) != RHS.getOperand(0))
4095 return SDValue();
4096
4097 SDValue Src = LHS.getOperand(0);
4098 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4099 return SDValue();
4100
4101 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4102 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4103 RHS.getConstantOperandAPInt(1) == NumElts) ||
4104 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4105 LHS.getConstantOperandAPInt(1) == NumElts))
4106 return Src;
4107
4108 return SDValue();
4109}
4110
4111static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4112 const SDLoc &dl, unsigned vectorWidth) {
4113 EVT VT = Vec.getValueType();
4114 EVT ElVT = VT.getVectorElementType();
4115 unsigned ResultNumElts =
4116 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4117 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4118
4119 assert(ResultVT.getSizeInBits() == vectorWidth &&
4120 "Illegal subvector extraction");
4121
4122 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4123 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4124 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4125
4126 // This is the index of the first element of the vectorWidth-bit chunk
4127 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4128 IdxVal &= ~(ElemsPerChunk - 1);
4129
4130 // If the input is a buildvector just emit a smaller one.
4131 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4132 return DAG.getBuildVector(ResultVT, dl,
4133 Vec->ops().slice(IdxVal, ElemsPerChunk));
4134
4135 // Check if we're extracting the upper undef of a widening pattern.
4136 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4137 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4138 isNullConstant(Vec.getOperand(2)))
4139 return DAG.getUNDEF(ResultVT);
4140
4141 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4142}
4143
4144/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4145/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4146/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4147/// instructions or a simple subregister reference. Idx is an index in the
4148/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4149/// lowering EXTRACT_VECTOR_ELT operations easier.
4150static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4151 SelectionDAG &DAG, const SDLoc &dl) {
4153 Vec.getValueType().is512BitVector()) &&
4154 "Unexpected vector size!");
4155 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4156}
4157
4158/// Generate a DAG to grab 256-bits from a 512-bit vector.
4159static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4160 SelectionDAG &DAG, const SDLoc &dl) {
4161 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4162 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4163}
4164
4165static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4166 SelectionDAG &DAG, const SDLoc &dl,
4167 unsigned vectorWidth) {
4168 assert((vectorWidth == 128 || vectorWidth == 256) &&
4169 "Unsupported vector width");
4170 // Inserting UNDEF is Result
4171 if (Vec.isUndef())
4172 return Result;
4173
4174 // Insert the relevant vectorWidth bits.
4175 EVT VT = Vec.getValueType();
4176 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4177 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4178
4179 // This is the index of the first element of the vectorWidth-bit chunk
4180 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4181 IdxVal &= ~(ElemsPerChunk - 1);
4182 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4183}
4184
4185/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4186/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4187/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4188/// simple superregister reference. Idx is an index in the 128 bits
4189/// we want. It need not be aligned to a 128-bit boundary. That makes
4190/// lowering INSERT_VECTOR_ELT operations easier.
4191static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4192 SelectionDAG &DAG, const SDLoc &dl) {
4193 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4194 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4195}
4196
4197/// Widen a vector to a larger size with the same scalar type, with the new
4198/// elements either zero or undef.
4199static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4200 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4201 const SDLoc &dl) {
4202 EVT VecVT = Vec.getValueType();
4204 VecVT.getScalarType() == VT.getScalarType() &&
4205 "Unsupported vector widening type");
4206 // If the upper 128-bits of a build vector are already undef/zero, then try to
4207 // widen from the lower 128-bits.
4208 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4209 unsigned NumSrcElts = VecVT.getVectorNumElements();
4210 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4211 if (all_of(Hi, [&](SDValue V) {
4212 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4213 }))
4214 Vec = extract128BitVector(Vec, 0, DAG, dl);
4215 }
4216 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4217 : DAG.getUNDEF(VT);
4218 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4219}
4220
4221/// Widen a vector to a larger size with the same scalar type, with the new
4222/// elements either zero or undef.
4223static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4224 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4225 const SDLoc &dl, unsigned WideSizeInBits) {
4226 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4227 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4228 "Unsupported vector widening type");
4229 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4230 MVT SVT = Vec.getSimpleValueType().getScalarType();
4231 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4232 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4233}
4234
4235/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4236/// and bitcast with integer types.
4237static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4238 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4239 unsigned NumElts = VT.getVectorNumElements();
4240 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4241 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4242 return VT;
4243}
4244
4245/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4246/// bitcast with integer types.
4247static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4248 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4249 const SDLoc &dl) {
4250 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4251 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4252}
4253
4254// Helper function to collect subvector ops that are concatenated together,
4255// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4256// The subvectors in Ops are guaranteed to be the same type.
4258 SelectionDAG &DAG) {
4259 assert(Ops.empty() && "Expected an empty ops vector");
4260
4261 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4262 Ops.append(N->op_begin(), N->op_end());
4263 return true;
4264 }
4265
4266 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4267 SDValue Src = N->getOperand(0);
4268 SDValue Sub = N->getOperand(1);
4269 const APInt &Idx = N->getConstantOperandAPInt(2);
4270 EVT VT = Src.getValueType();
4271 EVT SubVT = Sub.getValueType();
4272
4273 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4274 // insert_subvector(undef, x, lo)
4275 if (Idx == 0 && Src.isUndef()) {
4276 Ops.push_back(Sub);
4277 Ops.push_back(DAG.getUNDEF(SubVT));
4278 return true;
4279 }
4280 if (Idx == (VT.getVectorNumElements() / 2)) {
4281 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4282 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4283 Src.getOperand(1).getValueType() == SubVT &&
4284 isNullConstant(Src.getOperand(2))) {
4285 // Attempt to recurse into inner (matching) concats.
4286 SDValue Lo = Src.getOperand(1);
4287 SDValue Hi = Sub;
4288 SmallVector<SDValue, 2> LoOps, HiOps;
4289 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4290 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4291 LoOps.size() == HiOps.size()) {
4292 Ops.append(LoOps);
4293 Ops.append(HiOps);
4294 return true;
4295 }
4296 Ops.push_back(Lo);
4297 Ops.push_back(Hi);
4298 return true;
4299 }
4300 // insert_subvector(x, extract_subvector(x, lo), hi)
4301 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4302 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4303 Ops.append(2, Sub);
4304 return true;
4305 }
4306 // insert_subvector(undef, x, hi)
4307 if (Src.isUndef()) {
4308 Ops.push_back(DAG.getUNDEF(SubVT));
4309 Ops.push_back(Sub);
4310 return true;
4311 }
4312 }
4313 }
4314 }
4315
4316 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4317 EVT VT = N->getValueType(0);
4318 SDValue Src = N->getOperand(0);
4319 uint64_t Idx = N->getConstantOperandVal(1);
4320
4321 // Collect all the subvectors from the source vector and slice off the
4322 // extraction.
4324 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4325 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4326 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4327 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4328 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4329 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4330 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4331 return true;
4332 }
4333 }
4334
4335 assert(Ops.empty() && "Expected an empty ops vector");
4336 return false;
4337}
4338
4339// Helper to check if \p V can be split into subvectors and the upper subvectors
4340// are all undef. In which case return the lower subvector.
4342 SelectionDAG &DAG) {
4343 SmallVector<SDValue> SubOps;
4344 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4345 return SDValue();
4346
4347 unsigned NumSubOps = SubOps.size();
4348 unsigned HalfNumSubOps = NumSubOps / 2;
4349 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4350
4351 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4352 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4353 return SDValue();
4354
4355 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4356 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4357 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4358}
4359
4360// Helper to check if we can access all the constituent subvectors without any
4361// extract ops.
4364 return collectConcatOps(V.getNode(), Ops, DAG);
4365}
4366
4367static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4368 const SDLoc &dl) {
4369 EVT VT = Op.getValueType();
4370 unsigned NumElems = VT.getVectorNumElements();
4371 unsigned SizeInBits = VT.getSizeInBits();
4372 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4373 "Can't split odd sized vector");
4374
4376 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4377 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4378 unsigned HalfOps = SubOps.size() / 2;
4379 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4380 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4381 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4382 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4383 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4384 return std::make_pair(Lo, Hi);
4385 }
4386
4387 // If this is a splat value (with no-undefs) then use the lower subvector,
4388 // which should be a free extraction.
4389 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4390 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4391 return std::make_pair(Lo, Lo);
4392
4393 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4394 return std::make_pair(Lo, Hi);
4395}
4396
4397/// Break an operation into 2 half sized ops and then concatenate the results.
4399 unsigned NumOps = Op.getNumOperands();
4400 EVT VT = Op.getValueType();
4401
4402 // Extract the LHS Lo/Hi vectors
4403 SmallVector<SDValue> LoOps(NumOps, SDValue());
4404 SmallVector<SDValue> HiOps(NumOps, SDValue());
4405 for (unsigned I = 0; I != NumOps; ++I) {
4406 SDValue SrcOp = Op.getOperand(I);
4407 if (!SrcOp.getValueType().isVector()) {
4408 LoOps[I] = HiOps[I] = SrcOp;
4409 continue;
4410 }
4411 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4412 }
4413
4414 EVT LoVT, HiVT;
4415 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4416 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4417 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4418 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4419}
4420
4421/// Break an unary integer operation into 2 half sized ops and then
4422/// concatenate the result back.
4424 const SDLoc &dl) {
4425 // Make sure we only try to split 256/512-bit types to avoid creating
4426 // narrow vectors.
4427 [[maybe_unused]] EVT VT = Op.getValueType();
4428 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4429 Op.getOperand(0).getValueType().is512BitVector()) &&
4430 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4431 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4432 VT.getVectorNumElements() &&
4433 "Unexpected VTs!");
4434 return splitVectorOp(Op, DAG, dl);
4435}
4436
4437/// Break a binary integer operation into 2 half sized ops and then
4438/// concatenate the result back.
4440 const SDLoc &dl) {
4441 // Assert that all the types match.
4442 [[maybe_unused]] EVT VT = Op.getValueType();
4443 assert(Op.getOperand(0).getValueType() == VT &&
4444 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4445 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4446 return splitVectorOp(Op, DAG, dl);
4447}
4448
4449// Helper for splitting operands of an operation to legal target size and
4450// apply a function on each part.
4451// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4452// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4453// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4454// The argument Builder is a function that will be applied on each split part:
4455// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4456template <typename F>
4458 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4459 F Builder, bool CheckBWI = true) {
4460 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4461 unsigned NumSubs = 1;
4462 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4463 (!CheckBWI && Subtarget.useAVX512Regs())) {
4464 if (VT.getSizeInBits() > 512) {
4465 NumSubs = VT.getSizeInBits() / 512;
4466 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4467 }
4468 } else if (Subtarget.hasAVX2()) {
4469 if (VT.getSizeInBits() > 256) {
4470 NumSubs = VT.getSizeInBits() / 256;
4471 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4472 }
4473 } else {
4474 if (VT.getSizeInBits() > 128) {
4475 NumSubs = VT.getSizeInBits() / 128;
4476 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4477 }
4478 }
4479
4480 if (NumSubs == 1)
4481 return Builder(DAG, DL, Ops);
4482
4484 for (unsigned i = 0; i != NumSubs; ++i) {
4486 for (SDValue Op : Ops) {
4487 EVT OpVT = Op.getValueType();
4488 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4489 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4490 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4491 }
4492 Subs.push_back(Builder(DAG, DL, SubOps));
4493 }
4494 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4495}
4496
4497// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4498// targets.
4499static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4501 const X86Subtarget &Subtarget) {
4502 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4503 MVT SVT = VT.getScalarType();
4504
4505 // If we have a 32/64 splatted constant, splat it to DstTy to
4506 // encourage a foldable broadcast'd operand.
4507 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4508 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4509 // AVX512 broadcasts 32/64-bit operands.
4510 // TODO: Support float once getAVX512Node is used by fp-ops.
4511 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4513 return SDValue();
4514 // If we're not widening, don't bother if we're not bitcasting.
4515 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4516 return SDValue();
4517 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4518 APInt SplatValue, SplatUndef;
4519 unsigned SplatBitSize;
4520 bool HasAnyUndefs;
4521 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4522 HasAnyUndefs, OpEltSizeInBits) &&
4523 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4524 return DAG.getConstant(SplatValue, DL, DstVT);
4525 }
4526 return SDValue();
4527 };
4528
4529 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4530
4531 MVT DstVT = VT;
4532 if (Widen)
4533 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4534
4535 // Canonicalize src operands.
4536 SmallVector<SDValue> SrcOps(Ops);
4537 for (SDValue &Op : SrcOps) {
4538 MVT OpVT = Op.getSimpleValueType();
4539 // Just pass through scalar operands.
4540 if (!OpVT.isVector())
4541 continue;
4542 assert(OpVT == VT && "Vector type mismatch");
4543
4544 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4545 Op = BroadcastOp;
4546 continue;
4547 }
4548
4549 // Just widen the subvector by inserting into an undef wide vector.
4550 if (Widen)
4551 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4552 }
4553
4554 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4555
4556 // Perform the 512-bit op then extract the bottom subvector.
4557 if (Widen)
4558 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4559 return Res;
4560}
4561
4562/// Insert i1-subvector to i1-vector.
4564 const X86Subtarget &Subtarget) {
4565
4566 SDLoc dl(Op);
4567 SDValue Vec = Op.getOperand(0);
4568 SDValue SubVec = Op.getOperand(1);
4569 SDValue Idx = Op.getOperand(2);
4570 unsigned IdxVal = Op.getConstantOperandVal(2);
4571
4572 // Inserting undef is a nop. We can just return the original vector.
4573 if (SubVec.isUndef())
4574 return Vec;
4575
4576 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4577 return Op;
4578
4579 MVT OpVT = Op.getSimpleValueType();
4580 unsigned NumElems = OpVT.getVectorNumElements();
4581 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4582
4583 // Extend to natively supported kshift.
4584 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4585
4586 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4587 // if necessary.
4588 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4589 // May need to promote to a legal type.
4590 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4591 DAG.getConstant(0, dl, WideOpVT),
4592 SubVec, Idx);
4593 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4594 }
4595
4596 MVT SubVecVT = SubVec.getSimpleValueType();
4597 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4598 assert(IdxVal + SubVecNumElems <= NumElems &&
4599 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4600 "Unexpected index value in INSERT_SUBVECTOR");
4601
4602 SDValue Undef = DAG.getUNDEF(WideOpVT);
4603
4604 if (IdxVal == 0) {
4605 // Zero lower bits of the Vec
4606 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4607 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4608 ZeroIdx);
4609 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4610 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4611 // Merge them together, SubVec should be zero extended.
4612 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4613 DAG.getConstant(0, dl, WideOpVT),
4614 SubVec, ZeroIdx);
4615 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4616 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4617 }
4618
4619 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4620 Undef, SubVec, ZeroIdx);
4621
4622 if (Vec.isUndef()) {
4623 assert(IdxVal != 0 && "Unexpected index");
4624 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4625 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4626 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4627 }
4628
4630 assert(IdxVal != 0 && "Unexpected index");
4631 // If upper elements of Vec are known undef, then just shift into place.
4632 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4633 [](SDValue V) { return V.isUndef(); })) {
4634 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4635 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4636 } else {
4637 NumElems = WideOpVT.getVectorNumElements();
4638 unsigned ShiftLeft = NumElems - SubVecNumElems;
4639 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4640 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4641 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4642 if (ShiftRight != 0)
4643 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4644 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4645 }
4646 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4647 }
4648
4649 // Simple case when we put subvector in the upper part
4650 if (IdxVal + SubVecNumElems == NumElems) {
4651 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4652 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4653 if (SubVecNumElems * 2 == NumElems) {
4654 // Special case, use legal zero extending insert_subvector. This allows
4655 // isel to optimize when bits are known zero.
4656 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4657 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4658 DAG.getConstant(0, dl, WideOpVT),
4659 Vec, ZeroIdx);
4660 } else {
4661 // Otherwise use explicit shifts to zero the bits.
4662 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4663 Undef, Vec, ZeroIdx);
4664 NumElems = WideOpVT.getVectorNumElements();
4665 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4666 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4667 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4668 }
4669 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4670 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4671 }
4672
4673 // Inserting into the middle is more complicated.
4674
4675 NumElems = WideOpVT.getVectorNumElements();
4676
4677 // Widen the vector if needed.
4678 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4679
4680 unsigned ShiftLeft = NumElems - SubVecNumElems;
4681 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4682
4683 // Do an optimization for the most frequently used types.
4684 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4685 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4686 Mask0.flipAllBits();
4687 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4688 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4689 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4690 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4691 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4692 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4693 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4694 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4695
4696 // Reduce to original width if needed.
4697 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4698 }
4699
4700 // Clear the upper bits of the subvector and move it to its insert position.
4701 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4702 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4703 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4704 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4705
4706 // Isolate the bits below the insertion point.
4707 unsigned LowShift = NumElems - IdxVal;
4708 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4709 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4710 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4711 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4712
4713 // Isolate the bits after the last inserted bit.
4714 unsigned HighShift = IdxVal + SubVecNumElems;
4715 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4716 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4717 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4718 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4719
4720 // Now OR all 3 pieces together.
4721 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4722 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4723
4724 // Reduce to original width if needed.
4725 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4726}
4727
4729 const SDLoc &dl) {
4730 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4731 EVT SubVT = V1.getValueType();
4732 EVT SubSVT = SubVT.getScalarType();
4733 unsigned SubNumElts = SubVT.getVectorNumElements();
4734 unsigned SubVectorWidth = SubVT.getSizeInBits();
4735 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4736 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4737 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4738}
4739
4740/// Returns a vector of specified type with all bits set.
4741/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4742/// Then bitcast to their original type, ensuring they get CSE'd.
4743static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4744 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4745 "Expected a 128/256/512-bit vector type");
4746 unsigned NumElts = VT.getSizeInBits() / 32;
4747 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4748 return DAG.getBitcast(VT, Vec);
4749}
4750
4751static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4752 SDValue In, SelectionDAG &DAG) {
4753 EVT InVT = In.getValueType();
4754 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4755
4756 // Canonicalize Opcode to general extension version.
4757 switch (Opcode) {
4758 case ISD::ANY_EXTEND:
4760 Opcode = ISD::ANY_EXTEND;
4761 break;
4762 case ISD::SIGN_EXTEND:
4764 Opcode = ISD::SIGN_EXTEND;
4765 break;
4766 case ISD::ZERO_EXTEND:
4768 Opcode = ISD::ZERO_EXTEND;
4769 break;
4770 default:
4771 llvm_unreachable("Unknown extension opcode");
4772 }
4773
4774 // For 256-bit vectors, we only need the lower (128-bit) input half.
4775 // For 512-bit vectors, we only need the lower input half or quarter.
4776 if (InVT.getSizeInBits() > 128) {
4777 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4778 "Expected VTs to be the same size!");
4779 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4780 In = extractSubVector(In, 0, DAG, DL,
4781 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4782 InVT = In.getValueType();
4783 }
4784
4785 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4786 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4787
4788 return DAG.getNode(Opcode, DL, VT, In);
4789}
4790
4791// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4792static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4793 SDValue Mask, SelectionDAG &DAG) {
4794 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4795 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4796 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4797}
4798
4800 bool Lo, bool Unary) {
4801 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4802 "Illegal vector type to unpack");
4803 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4804 int NumElts = VT.getVectorNumElements();
4805 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4806 for (int i = 0; i < NumElts; ++i) {
4807 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4808 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4809 Pos += (Unary ? 0 : NumElts * (i % 2));
4810 Pos += (Lo ? 0 : NumEltsInLane / 2);
4811 Mask.push_back(Pos);
4812 }
4813}
4814
4815/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4816/// imposed by AVX and specific to the unary pattern. Example:
4817/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4818/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4820 bool Lo) {
4821 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4822 int NumElts = VT.getVectorNumElements();
4823 for (int i = 0; i < NumElts; ++i) {
4824 int Pos = i / 2;
4825 Pos += (Lo ? 0 : NumElts / 2);
4826 Mask.push_back(Pos);
4827 }
4828}
4829
4830// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4831static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4832 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4835 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4836 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4837 int M = Mask[I];
4838 if (M < 0)
4839 continue;
4840 SDValue V = (M < NumElts) ? V1 : V2;
4841 if (V.isUndef())
4842 continue;
4843 Ops[I] = V.getOperand(M % NumElts);
4844 }
4845 return DAG.getBuildVector(VT, dl, Ops);
4846 }
4847
4848 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4849}
4850
4851/// Returns a vector_shuffle node for an unpackl operation.
4852static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4853 SDValue V1, SDValue V2) {
4855 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4856 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4857}
4858
4859/// Returns a vector_shuffle node for an unpackh operation.
4860static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4861 SDValue V1, SDValue V2) {
4863 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4864 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4865}
4866
4867/// Returns a node that packs the LHS + RHS nodes together at half width.
4868/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4869/// TODO: Add subvector splitting if/when we have a need for it.
4870static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4871 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4872 bool PackHiHalf = false) {
4873 MVT OpVT = LHS.getSimpleValueType();
4874 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4875 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4876 assert(OpVT == RHS.getSimpleValueType() &&
4877 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4878 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4879 "Unexpected PACK operand types");
4880 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4881 "Unexpected PACK result type");
4882
4883 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4884 if (EltSizeInBits == 32) {
4885 SmallVector<int> PackMask;
4886 int Offset = PackHiHalf ? 1 : 0;
4887 int NumElts = VT.getVectorNumElements();
4888 for (int I = 0; I != NumElts; I += 4) {
4889 PackMask.push_back(I + Offset);
4890 PackMask.push_back(I + Offset + 2);
4891 PackMask.push_back(I + Offset + NumElts);
4892 PackMask.push_back(I + Offset + NumElts + 2);
4893 }
4894 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4895 DAG.getBitcast(VT, RHS), PackMask);
4896 }
4897
4898 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4899 if (!PackHiHalf) {
4900 if (UsePackUS &&
4901 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4902 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4903 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4904
4905 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4906 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4907 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4908 }
4909
4910 // Fallback to sign/zero extending the requested half and pack.
4911 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4912 if (UsePackUS) {
4913 if (PackHiHalf) {
4914 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4915 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4916 } else {
4917 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4918 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4919 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4920 };
4921 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4922 };
4923
4924 if (!PackHiHalf) {
4925 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4926 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4927 }
4928 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4929 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4930 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4931}
4932
4933/// Return a vector_shuffle of the specified vector of zero or undef vector.
4934/// This produces a shuffle where the low element of V2 is swizzled into the
4935/// zero/undef vector, landing at element Idx.
4936/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4938 bool IsZero,
4939 const X86Subtarget &Subtarget,
4940 SelectionDAG &DAG) {
4941 MVT VT = V2.getSimpleValueType();
4942 SDValue V1 = IsZero
4943 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4944 int NumElems = VT.getVectorNumElements();
4945 SmallVector<int, 16> MaskVec(NumElems);
4946 for (int i = 0; i != NumElems; ++i)
4947 // If this is the insertion idx, put the low elt of V2 here.
4948 MaskVec[i] = (i == Idx) ? NumElems : i;
4949 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4950}
4951
4953 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4954 Ptr.getOpcode() == X86ISD::WrapperRIP)
4955 Ptr = Ptr.getOperand(0);
4956 return dyn_cast<ConstantPoolSDNode>(Ptr);
4957}
4958
4959// TODO: Add support for non-zero offsets.
4962 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4963 return nullptr;
4964 return CNode->getConstVal();
4965}
4966
4968 if (!Load || !ISD::isNormalLoad(Load))
4969 return nullptr;
4970 return getTargetConstantFromBasePtr(Load->getBasePtr());
4971}
4972
4975 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4976}
4977
4978const Constant *
4980 assert(LD && "Unexpected null LoadSDNode");
4981 return getTargetConstantFromNode(LD);
4982}
4983
// NOTE(review): the opening line of this helper (its name and parameter
// list, original line 4984) and the final conjunct of the return expression
// (original line 4991) were lost in extraction. From the body it reads an
// SDNode *N and the X86 subtarget; reconstruct both missing lines from
// upstream before relying on this text.
 4985 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
 4986 SDValue Cond = N->getOperand(0);
// Operand 2 of a VSELECT is the "false" value — presumably the 0s referred
// to in the comment above; confirm against the missing conjunct.
 4987 SDValue RHS = N->getOperand(2);
 4988 EVT CondVT = Cond.getValueType();
// Only applies to AVX512 vXi1 mask selects; the trailing && shows one more
// (missing) condition before the closing brace.
 4989 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
 4990 CondVT.getVectorElementType() == MVT::i1 &&
 4992}
4993
4994// Extract raw constant bits from constant pools.
4995static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4996 APInt &UndefElts,
4997 SmallVectorImpl<APInt> &EltBits,
4998 bool AllowWholeUndefs = true,
4999 bool AllowPartialUndefs = false) {
5000 assert(EltBits.empty() && "Expected an empty EltBits vector");
5001
5003
5004 EVT VT = Op.getValueType();
5005 unsigned SizeInBits = VT.getSizeInBits();
5006 unsigned NumElts = SizeInBits / EltSizeInBits;
5007
5008 // Can't split constant.
5009 if ((SizeInBits % EltSizeInBits) != 0)
5010 return false;
5011
5012 // Bitcast a source array of element bits to the target size.
5013 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5014 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5015 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5016 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5017 "Constant bit sizes don't match");
5018
5019 // Don't split if we don't allow undef bits.
5020 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5021 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5022 return false;
5023
5024 // If we're already the right size, don't bother bitcasting.
5025 if (NumSrcElts == NumElts) {
5026 UndefElts = UndefSrcElts;
5027 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5028 return true;
5029 }
5030
5031 // Extract all the undef/constant element data and pack into single bitsets.
5032 APInt UndefBits(SizeInBits, 0);
5033 APInt MaskBits(SizeInBits, 0);
5034
5035 for (unsigned i = 0; i != NumSrcElts; ++i) {
5036 unsigned BitOffset = i * SrcEltSizeInBits;
5037 if (UndefSrcElts[i])
5038 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5039 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5040 }
5041
5042 // Split the undef/constant single bitset data into the target elements.
5043 UndefElts = APInt(NumElts, 0);
5044 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5045
5046 for (unsigned i = 0; i != NumElts; ++i) {
5047 unsigned BitOffset = i * EltSizeInBits;
5048 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5049
5050 // Only treat an element as UNDEF if all bits are UNDEF.
5051 if (UndefEltBits.isAllOnes()) {
5052 if (!AllowWholeUndefs)
5053 return false;
5054 UndefElts.setBit(i);
5055 continue;
5056 }
5057
5058 // If only some bits are UNDEF then treat them as zero (or bail if not
5059 // supported).
5060 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5061 return false;
5062
5063 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5064 }
5065 return true;
5066 };
5067
5068 // Collect constant bits and insert into mask/undef bit masks.
5069 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5070 unsigned UndefBitIndex) {
5071 if (!Cst)
5072 return false;
5073 if (isa<UndefValue>(Cst)) {
5074 Undefs.setBit(UndefBitIndex);
5075 return true;
5076 }
5077 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5078 Mask = CInt->getValue();
5079 return true;
5080 }
5081 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5082 Mask = CFP->getValueAPF().bitcastToAPInt();
5083 return true;
5084 }
5085 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5086 Type *Ty = CDS->getType();
5088 Type *EltTy = CDS->getElementType();
5089 bool IsInteger = EltTy->isIntegerTy();
5090 bool IsFP =
5091 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5092 if (!IsInteger && !IsFP)
5093 return false;
5094 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5095 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5096 if (IsInteger)
5097 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5098 else
5099 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5100 I * EltBits);
5101 return true;
5102 }
5103 return false;
5104 };
5105
5106 // Handle UNDEFs.
5107 if (Op.isUndef()) {
5108 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5109 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5110 return CastBitData(UndefSrcElts, SrcEltBits);
5111 }
5112
5113 // Extract scalar constant bits.
5114 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5115 APInt UndefSrcElts = APInt::getZero(1);
5116 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5117 return CastBitData(UndefSrcElts, SrcEltBits);
5118 }
5119 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5120 APInt UndefSrcElts = APInt::getZero(1);
5121 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5122 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5123 return CastBitData(UndefSrcElts, SrcEltBits);
5124 }
5125
5126 // Extract constant bits from build vector.
5127 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5128 BitVector Undefs;
5129 SmallVector<APInt> SrcEltBits;
5130 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5131 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5132 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5133 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5134 if (Undefs[I])
5135 UndefSrcElts.setBit(I);
5136 return CastBitData(UndefSrcElts, SrcEltBits);
5137 }
5138 }
5139
5140 // Extract constant bits from constant pool vector.
5141 if (auto *Cst = getTargetConstantFromNode(Op)) {
5142 Type *CstTy = Cst->getType();
5143 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5144 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5145 return false;
5146
5147 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5148 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5149 if ((SizeInBits % SrcEltSizeInBits) != 0)
5150 return false;
5151
5152 APInt UndefSrcElts(NumSrcElts, 0);
5153 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5154 for (unsigned i = 0; i != NumSrcElts; ++i)
5155 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5156 UndefSrcElts, i))
5157 return false;
5158
5159 return CastBitData(UndefSrcElts, SrcEltBits);
5160 }
5161
5162 // Extract constant bits from a broadcasted constant pool scalar.
5163 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5164 EltSizeInBits <= VT.getScalarSizeInBits()) {
5165 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5166 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5167 return false;
5168
5169 SDValue Ptr = MemIntr->getBasePtr();
5171 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5172 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5173
5174 APInt UndefSrcElts(NumSrcElts, 0);
5175 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5176 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5177 if (UndefSrcElts[0])
5178 UndefSrcElts.setBits(0, NumSrcElts);
5179 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5180 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5181 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5182 return CastBitData(UndefSrcElts, SrcEltBits);
5183 }
5184 }
5185 }
5186
5187 // Extract constant bits from a subvector broadcast.
5188 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5189 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5190 SDValue Ptr = MemIntr->getBasePtr();
5191 // The source constant may be larger than the subvector broadcast,
5192 // ensure we extract the correct subvector constants.
5193 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5194 Type *CstTy = Cst->getType();
5195 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5196 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5197 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5198 (SizeInBits % SubVecSizeInBits) != 0)
5199 return false;
5200 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5201 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5202 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5203 APInt UndefSubElts(NumSubElts, 0);
5204 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5205 APInt(CstEltSizeInBits, 0));
5206 for (unsigned i = 0; i != NumSubElts; ++i) {
5207 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5208 UndefSubElts, i))
5209 return false;
5210 for (unsigned j = 1; j != NumSubVecs; ++j)
5211 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5212 }
5213 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5214 UndefSubElts);
5215 return CastBitData(UndefSubElts, SubEltBits);
5216 }
5217 }
5218
5219 // Extract a rematerialized scalar constant insertion.
5220 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5221 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5222 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5223 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5224 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5225
5226 APInt UndefSrcElts(NumSrcElts, 0);
5227 SmallVector<APInt, 64> SrcEltBits;
5228 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5229 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5230 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5231 return CastBitData(UndefSrcElts, SrcEltBits);
5232 }
5233
5234 // Insert constant bits from a base and sub vector sources.
5235 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5236 // If bitcasts to larger elements we might lose track of undefs - don't
5237 // allow any to be safe.
5238 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5239 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5240
5241 APInt UndefSrcElts, UndefSubElts;
5242 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5243 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5244 UndefSubElts, EltSubBits,
5245 AllowWholeUndefs && AllowUndefs,
5246 AllowPartialUndefs && AllowUndefs) &&
5247 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5248 UndefSrcElts, EltSrcBits,
5249 AllowWholeUndefs && AllowUndefs,
5250 AllowPartialUndefs && AllowUndefs)) {
5251 unsigned BaseIdx = Op.getConstantOperandVal(2);
5252 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5253 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5254 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5255 return CastBitData(UndefSrcElts, EltSrcBits);
5256 }
5257 }
5258
5259 // Extract constant bits from a subvector's source.
5260 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5261 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5262 EltBits, AllowWholeUndefs,
5263 AllowPartialUndefs)) {
5264 EVT SrcVT = Op.getOperand(0).getValueType();
5265 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5266 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5267 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5268 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5269 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5270 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5271 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5272
5273 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5274 if ((BaseIdx + NumSubElts) != NumSrcElts)
5275 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5276 if (BaseIdx != 0)
5277 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5278 return true;
5279 }
5280
5281 // Extract constant bits from shuffle node sources.
5282 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5283 // TODO - support shuffle through bitcasts.
5284 if (EltSizeInBits != VT.getScalarSizeInBits())
5285 return false;
5286
5287 ArrayRef<int> Mask = SVN->getMask();
5288 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5289 llvm::any_of(Mask, [](int M) { return M < 0; }))
5290 return false;
5291
5292 APInt UndefElts0, UndefElts1;
5293 SmallVector<APInt, 32> EltBits0, EltBits1;
5294 if (isAnyInRange(Mask, 0, NumElts) &&
5295 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5296 UndefElts0, EltBits0, AllowWholeUndefs,
5297 AllowPartialUndefs))
5298 return false;
5299 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5300 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5301 UndefElts1, EltBits1, AllowWholeUndefs,
5302 AllowPartialUndefs))
5303 return false;
5304
5305 UndefElts = APInt::getZero(NumElts);
5306 for (int i = 0; i != (int)NumElts; ++i) {
5307 int M = Mask[i];
5308 if (M < 0) {
5309 UndefElts.setBit(i);
5310 EltBits.push_back(APInt::getZero(EltSizeInBits));
5311 } else if (M < (int)NumElts) {
5312 if (UndefElts0[M])
5313 UndefElts.setBit(i);
5314 EltBits.push_back(EltBits0[M]);
5315 } else {
5316 if (UndefElts1[M - NumElts])
5317 UndefElts.setBit(i);
5318 EltBits.push_back(EltBits1[M - NumElts]);
5319 }
5320 }
5321 return true;
5322 }
5323
5324 return false;
5325}
5326
5327namespace llvm {
5328namespace X86 {
5329bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5330 APInt UndefElts;
5331 SmallVector<APInt, 16> EltBits;
5333 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5334 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5335 int SplatIndex = -1;
5336 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5337 if (UndefElts[i])
5338 continue;
5339 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5340 SplatIndex = -1;
5341 break;
5342 }
5343 SplatIndex = i;
5344 }
5345 if (0 <= SplatIndex) {
5346 SplatVal = EltBits[SplatIndex];
5347 return true;
5348 }
5349 }
5350
5351 return false;
5352}
5353} // namespace X86
5354} // namespace llvm
5355
5357 unsigned MaskEltSizeInBits,
5359 APInt &UndefElts) {
5360 // Extract the raw target constant bits.
5361 SmallVector<APInt, 64> EltBits;
5362 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5363 EltBits, /* AllowWholeUndefs */ true,
5364 /* AllowPartialUndefs */ false))
5365 return false;
5366
5367 // Insert the extracted elements into the mask.
5368 for (const APInt &Elt : EltBits)
5369 RawMask.push_back(Elt.getZExtValue());
5370
5371 return true;
5372}
5373
5374static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5375 bool AllowUndefs) {
5376 APInt UndefElts;
5377 SmallVector<APInt, 64> EltBits;
5378 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5379 /*AllowWholeUndefs*/ AllowUndefs,
5380 /*AllowPartialUndefs*/ false))
5381 return false;
5382
5383 bool IsPow2OrUndef = true;
5384 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5385 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5386 return IsPow2OrUndef;
5387}
5388
5389// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5391 // TODO: don't always ignore oneuse constraints.
5392 V = peekThroughBitcasts(V);
5393 EVT VT = V.getValueType();
5394
5395 // Match not(xor X, -1) -> X.
5396 if (V.getOpcode() == ISD::XOR &&
5397 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5398 isAllOnesConstant(V.getOperand(1))))
5399 return V.getOperand(0);
5400
5401 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5402 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5403 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5404 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5405 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5406 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5407 V.getOperand(1));
5408 }
5409 }
5410
5411 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5412 if (V.getOpcode() == X86ISD::PCMPGT &&
5413 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5414 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5415 V.getOperand(0).hasOneUse()) {
5416 APInt UndefElts;
5417 SmallVector<APInt> EltBits;
5418 if (getTargetConstantBitsFromNode(V.getOperand(0),
5419 V.getScalarValueSizeInBits(), UndefElts,
5420 EltBits) &&
5421 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5422 // Don't fold min_signed_value -> (min_signed_value - 1)
5423 bool MinSigned = false;
5424 for (APInt &Elt : EltBits) {
5425 MinSigned |= Elt.isMinSignedValue();
5426 Elt -= 1;
5427 }
5428 if (!MinSigned) {
5429 SDLoc DL(V);
5430 MVT VT = V.getSimpleValueType();
5431 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5432 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5433 }
5434 }
5435 }
5436
5437 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5439 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5440 for (SDValue &CatOp : CatOps) {
5441 SDValue NotCat = IsNOT(CatOp, DAG);
5442 if (!NotCat)
5443 return SDValue();
5444 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5445 }
5446 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5447 }
5448
5449 // Match not(or(not(X),not(Y))) -> and(X, Y).
5450 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5451 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5452 // TODO: Handle cases with single NOT operand -> ANDNP
5453 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5454 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5455 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5456 DAG.getBitcast(VT, Op1));
5457 }
5458
5459 return SDValue();
5460}
5461
5462/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5463/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5464/// Note: This ignores saturation, so inputs must be checked first.
5466 bool Unary, unsigned NumStages = 1) {
5467 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5468 unsigned NumElts = VT.getVectorNumElements();
5469 unsigned NumLanes = VT.getSizeInBits() / 128;
5470 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5471 unsigned Offset = Unary ? 0 : NumElts;
5472 unsigned Repetitions = 1u << (NumStages - 1);
5473 unsigned Increment = 1u << NumStages;
5474 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5475
5476 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5477 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5478 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5479 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5480 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5481 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5482 }
5483 }
5484}
5485
5486// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5487static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5488 APInt &DemandedLHS, APInt &DemandedRHS) {
5489 int NumLanes = VT.getSizeInBits() / 128;
5490 int NumElts = DemandedElts.getBitWidth();
5491 int NumInnerElts = NumElts / 2;
5492 int NumEltsPerLane = NumElts / NumLanes;
5493 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5494
5495 DemandedLHS = APInt::getZero(NumInnerElts);
5496 DemandedRHS = APInt::getZero(NumInnerElts);
5497
5498 // Map DemandedElts to the packed operands.
5499 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5500 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5501 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5502 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5503 if (DemandedElts[OuterIdx])
5504 DemandedLHS.setBit(InnerIdx);
5505 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5506 DemandedRHS.setBit(InnerIdx);
5507 }
5508 }
5509}
5510
// Split the demanded elts of a HADD/HSUB node between its operands.
// Each horizontal-op result element consumes a *pair* of adjacent input
// elements, so after the PACK-style operand split, widen every demanded bit
// to cover both elements of its source pair.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
                                      DemandedLHS, DemandedRHS);
  DemandedLHS |= DemandedLHS << 1;
  DemandedRHS |= DemandedRHS << 1;
}
5519
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned MaskEltSize = VT.getScalarSizeInBits();
  APInt RawUndefs;
  uint64_t ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  // "Fake unary" = a two-input shuffle whose operands are the same node; the
  // mask is remapped after the switch so it only references input 0.
  bool IsFakeUnary = false;
  switch (N.getOpcode()) {
  case X86ISD::BLENDI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeBLENDMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::SHUFP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::INSERTPS:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::EXTRQI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    // Only decodable when the bit length/index operands are constants.
    if (isa<ConstantSDNode>(N.getOperand(1)) &&
        isa<ConstantSDNode>(N.getOperand(2))) {
      int BitLen = N.getConstantOperandVal(1);
      int BitIdx = N.getConstantOperandVal(2);
      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = true;
    }
    break;
  case X86ISD::INSERTQI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    // Only decodable when the bit length/index operands are constants.
    if (isa<ConstantSDNode>(N.getOperand(2)) &&
        isa<ConstantSDNode>(N.getOperand(3))) {
      int BitLen = N.getConstantOperandVal(2);
      int BitIdx = N.getConstantOperandVal(3);
      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    }
    break;
  case X86ISD::UNPCKH:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::UNPCKL:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::VALIGN:
    assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
           "Only 32-bit and 64-bit elements are supported!");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeVALIGNMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    // Operands are pushed in reverse (Op1, Op0) order to match how the
    // decoded mask indexes the sources.
    Ops.push_back(N.getOperand(1));
    Ops.push_back(N.getOperand(0));
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePALIGNRMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    // As with VALIGN, the decoded mask references the operands reversed.
    Ops.push_back(N.getOperand(1));
    Ops.push_back(N.getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSLLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSRLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSHUFHWMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSHUFLWMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeZeroMoveLowMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST:
    // We only decode broadcasts of same-sized vectors, peeking through to
    // extracted subvectors is likely to cause hasOneUse issues with
    // SimplifyDemandedBits etc.
    if (N.getOperand(0).getValueType() == VT) {
      DecodeVectorBroadcast(NumElems, Mask);
      IsUnary = true;
      break;
    }
    return false;
  case X86ISD::VPERMILPV: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    // Variable shuffle: mask comes from operand 1 and must be constant bits.
    SDValue MaskNode = N.getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    SDValue MaskNode = N.getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodePSHUFBMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeVPERMMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::MOVSH:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::SHUF128:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSLDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSHDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVDDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VPERMIL2: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    // Needs both a constant selector mask (op 2) and control immediate (op 3).
    SDValue MaskNode = N.getOperand(2);
    SDValue CtrlNode = N.getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                      RawUndefs)) {
        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
                            Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    SDValue MaskNode = N.getOperand(2);
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodeVPPERMMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N.getOperand(1));
    SDValue MaskNode = N.getOperand(0);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMVMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N.getOperand(0));
    Ops.push_back(N.getOperand(2));
    SDValue MaskNode = N.getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  default:
    llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero && isAnyZero(Mask))
    return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N.getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N.getOperand(1));
  }

  return true;
}
5832
// Wrapper for getTargetShuffleMask that discards the IsUnary result.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
                                 SmallVectorImpl<int> &Mask) {
  bool IsUnary;
  return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
}
5840
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
                                             SDValue V1, SDValue V2,
                                             APInt &KnownUndef, APInt &KnownZero) {
  int Size = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(Size);

  // Look through bitcasts so constant inputs behind a cast are still visible.
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Size;
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0) {
      // Sentinel (undef) mask element.
      KnownUndef.setBit(i);
      continue;
    }
    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      // Whole referenced input is a zero build vector.
      KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef())
        KnownUndef.setBit(i);
      if (X86::isZeroNode(Op))
        KnownZero.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        // Extract just the bits of the constant covered by this mask element.
        APInt Val = Cst->getAPIntValue();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllUndef = true;
      bool AllZero = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllUndef &= Op.isUndef();
        AllZero &= X86::isZeroNode(Op);
      }
      if (AllUndef)
        KnownUndef.setBit(i);
      if (AllZero)
        KnownZero.setBit(i);
      continue;
    }
  }
}
5927
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
                                   APInt &KnownUndef, APInt &KnownZero) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  // Decode the shuffle, allowing SM_SentinelZero elements in the mask.
  if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
    return false;

  int Size = Mask.size();
  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];
  KnownUndef = KnownZero = APInt::getZero(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Size) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Size;

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ false)};

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0) {
      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
      if (SM_SentinelUndef == M)
        KnownUndef.setBit(i);
      if (SM_SentinelZero == M)
        KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size; // 0 = V1, 1 = V2.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      KnownUndef.setBit(i);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        KnownUndef.setBit(i);
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        KnownZero.setBit(i);
      continue;
    }

    // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
    // base vectors.
    if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Vec = V.getOperand(0);
      int NumVecElts = Vec.getValueType().getVectorNumElements();
      if (Vec.isUndef() && Size == NumVecElts) {
        int Idx = V.getConstantOperandVal(2);
        int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
        // Elements outside the inserted subvector come from the undef base.
        if (M < Idx || (Idx + NumSubElts) <= M)
          KnownUndef.setBit(i);
      }
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        KnownUndef.setBit(i);
      else if (SrcEltBits[SrcIdx][M] == 0)
        KnownZero.setBit(i);
    }
  }

  assert(VT.getVectorNumElements() == (unsigned)Size &&
         "Different mask size from vector size!");
  return true;
}
6032
// Replace target shuffle mask elements with known undef/zero sentinels.
// KnownUndef takes priority over KnownZero for the same element; zero
// resolution can be disabled via ResolveKnownZeros.
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros= true) {
  unsigned NumElts = Mask.size();
  assert(KnownUndef.getBitWidth() == NumElts &&
         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");

  for (unsigned i = 0; i != NumElts; ++i) {
    if (KnownUndef[i])
      Mask[i] = SM_SentinelUndef;
    else if (ResolveKnownZeros && KnownZero[i])
      Mask[i] = SM_SentinelZero;
  }
}
6049
// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
// The inverse of resolveTargetShuffleFromZeroables: sentinel mask elements
// are translated back into per-element KnownUndef/KnownZero bits.
                                              APInt &KnownUndef,
                                              APInt &KnownZero) {
  unsigned NumElts = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(NumElts);

  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (SM_SentinelUndef == M)
      KnownUndef.setBit(i);
    if (SM_SentinelZero == M)
      KnownZero.setBit(i);
  }
}
6065
// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
// A zero condition element (VSELECT) or a non-negative one (BLENDV, which
// selects on the sign bit) picks the 2nd operand; otherwise the 1st.
// Requires the condition to be a build vector of constant bits.
                                         SDValue Cond, bool IsBLENDV = false) {
  EVT CondVT = Cond.getValueType();
  unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
  unsigned NumElts = CondVT.getVectorNumElements();

  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
                                     /*AllowWholeUndefs*/ true,
                                     /*AllowPartialUndefs*/ false))
    return false;

  Mask.resize(NumElts, SM_SentinelUndef);

  for (int i = 0; i != (int)NumElts; ++i) {
    Mask[i] = i;
    // Arbitrarily choose from the 2nd operand if the select condition element
    // is undef.
    // TODO: Can we do better by matching patterns such as even/odd?
    if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
        (IsBLENDV && EltBits[i].isNonNegative()))
      Mask[i] += NumElts;
  }

  return true;
}
6094
6095// Forward declaration (for getFauxShuffleMask recursive check).
6096static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6099 const SelectionDAG &DAG, unsigned Depth,
6100 bool ResolveKnownElts);
6101
6102// Attempt to decode ops that could be represented as a shuffle mask.
6103// The decoded shuffle mask may contain a different number of elements to the
6104// destination value type.
6105// TODO: Merge into getTargetShuffleInputs()
6106static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6109 const SelectionDAG &DAG, unsigned Depth,
6110 bool ResolveKnownElts) {
6111 Mask.clear();
6112 Ops.clear();
6113
6114 MVT VT = N.getSimpleValueType();
6115 unsigned NumElts = VT.getVectorNumElements();
6116 unsigned NumSizeInBits = VT.getSizeInBits();
6117 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6118 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6119 return false;
6120 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6121 unsigned NumSizeInBytes = NumSizeInBits / 8;
6122 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6123
6124 unsigned Opcode = N.getOpcode();
6125 switch (Opcode) {
6126 case ISD::VECTOR_SHUFFLE: {
6127 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6128 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6129 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6130 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6131 Ops.push_back(N.getOperand(0));
6132 Ops.push_back(N.getOperand(1));
6133 return true;
6134 }
6135 return false;
6136 }
6137 case ISD::AND:
6138 case X86ISD::ANDNP: {
6139 // Attempt to decode as a per-byte mask.
6140 APInt UndefElts;
6141 SmallVector<APInt, 32> EltBits;
6142 SDValue N0 = N.getOperand(0);
6143 SDValue N1 = N.getOperand(1);
6144 bool IsAndN = (X86ISD::ANDNP == Opcode);
6145 uint64_t ZeroMask = IsAndN ? 255 : 0;
6146 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6147 /*AllowWholeUndefs*/ false,
6148 /*AllowPartialUndefs*/ false))
6149 return false;
6150 // We can't assume an undef src element gives an undef dst - the other src
6151 // might be zero.
6152 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6153 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6154 const APInt &ByteBits = EltBits[i];
6155 if (ByteBits != 0 && ByteBits != 255)
6156 return false;
6157 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6158 }
6159 Ops.push_back(IsAndN ? N1 : N0);
6160 return true;
6161 }
6162 case ISD::OR: {
6163 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6164 // is a valid shuffle index.
6165 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6166 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6167 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6168 return false;
6169
6170 SmallVector<int, 64> SrcMask0, SrcMask1;
6171 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6174 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6175 Depth + 1, true) ||
6176 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6177 Depth + 1, true))
6178 return false;
6179
6180 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6181 SmallVector<int, 64> Mask0, Mask1;
6182 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6183 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6184 for (int i = 0; i != (int)MaskSize; ++i) {
6185 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6186 // loops converting between OR and BLEND shuffles due to
6187 // canWidenShuffleElements merging away undef elements, meaning we
6188 // fail to recognise the OR as the undef element isn't known zero.
6189 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6190 Mask.push_back(SM_SentinelZero);
6191 else if (Mask1[i] == SM_SentinelZero)
6192 Mask.push_back(i);
6193 else if (Mask0[i] == SM_SentinelZero)
6194 Mask.push_back(i + MaskSize);
6195 else
6196 return false;
6197 }
6198 Ops.push_back(N.getOperand(0));
6199 Ops.push_back(N.getOperand(1));
6200 return true;
6201 }
6202 case ISD::CONCAT_VECTORS: {
6203 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6204 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6205 if (NumBitsPerElt == 64) {
6206 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6207 for (unsigned M = 0; M != NumSubElts; ++M)
6208 Mask.push_back((I * NumElts) + M);
6209 Ops.push_back(N.getOperand(I));
6210 }
6211 return true;
6212 }
6213 return false;
6214 }
6215 case ISD::INSERT_SUBVECTOR: {
6216 SDValue Src = N.getOperand(0);
6217 SDValue Sub = N.getOperand(1);
6218 EVT SubVT = Sub.getValueType();
6219 unsigned NumSubElts = SubVT.getVectorNumElements();
6220 uint64_t InsertIdx = N.getConstantOperandVal(2);
6221 // Subvector isn't demanded - just return the base vector.
6222 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6223 Mask.resize(NumElts);
6224 std::iota(Mask.begin(), Mask.end(), 0);
6225 Ops.push_back(Src);
6226 return true;
6227 }
6228 // Handle CONCAT(SUB0, SUB1).
6229 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6230 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6231 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6232 Src.getOperand(0).isUndef() &&
6233 Src.getOperand(1).getValueType() == SubVT &&
6234 Src.getConstantOperandVal(2) == 0 &&
6235 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6236 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6237 Mask.resize(NumElts);
6238 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6239 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6240 Ops.push_back(Src.getOperand(1));
6241 Ops.push_back(Sub);
6242 return true;
6243 }
6244 if (!N->isOnlyUserOf(Sub.getNode()))
6245 return false;
6246
6247 SmallVector<int, 64> SubMask;
6248 SmallVector<SDValue, 2> SubInputs;
6250 EVT SubSrcVT = SubSrc.getValueType();
6251 if (!SubSrcVT.isVector())
6252 return false;
6253
6254 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6255 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6256 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6257 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6258 SDValue SubSrcSrc = SubSrc.getOperand(0);
6259 unsigned NumSubSrcSrcElts =
6260 SubSrcSrc.getValueType().getVectorNumElements();
6261 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6262 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6263 "Subvector valuetype mismatch");
6264 InsertIdx *= (MaxElts / NumElts);
6265 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6266 NumSubElts *= (MaxElts / NumElts);
6267 bool SrcIsUndef = Src.isUndef();
6268 for (int i = 0; i != (int)MaxElts; ++i)
6269 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6270 for (int i = 0; i != (int)NumSubElts; ++i)
6271 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6272 if (!SrcIsUndef)
6273 Ops.push_back(Src);
6274 Ops.push_back(SubSrcSrc);
6275 return true;
6276 }
6277
6278 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6279 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6280 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6281 Depth + 1, ResolveKnownElts))
6282 return false;
6283
6284 // Subvector shuffle inputs must not be larger than the subvector.
6285 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6286 return SubVT.getFixedSizeInBits() <
6287 SubInput.getValueSizeInBits().getFixedValue();
6288 }))
6289 return false;
6290
6291 if (SubMask.size() != NumSubElts) {
6292 assert(((SubMask.size() % NumSubElts) == 0 ||
6293 (NumSubElts % SubMask.size()) == 0) &&
6294 "Illegal submask scale");
6295 if ((NumSubElts % SubMask.size()) == 0) {
6296 int Scale = NumSubElts / SubMask.size();
6297 SmallVector<int, 64> ScaledSubMask;
6298 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6299 SubMask = ScaledSubMask;
6300 } else {
6301 int Scale = SubMask.size() / NumSubElts;
6302 NumSubElts = SubMask.size();
6303 NumElts *= Scale;
6304 InsertIdx *= Scale;
6305 }
6306 }
6307 Ops.push_back(Src);
6308 Ops.append(SubInputs.begin(), SubInputs.end());
6309 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6310 Mask.append(NumElts, SM_SentinelZero);
6311 else
6312 for (int i = 0; i != (int)NumElts; ++i)
6313 Mask.push_back(i);
6314 for (int i = 0; i != (int)NumSubElts; ++i) {
6315 int M = SubMask[i];
6316 if (0 <= M) {
6317 int InputIdx = M / NumSubElts;
6318 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6319 }
6320 Mask[i + InsertIdx] = M;
6321 }
6322 return true;
6323 }
6324 case X86ISD::PINSRB:
6325 case X86ISD::PINSRW:
6328 // Match against a insert_vector_elt/scalar_to_vector of an extract from a
6329 // vector, for matching src/dst vector types.
6330 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6331
6332 unsigned DstIdx = 0;
6333 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6334 // Check we have an in-range constant insertion index.
6335 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6336 N.getConstantOperandAPInt(2).uge(NumElts))
6337 return false;
6338 DstIdx = N.getConstantOperandVal(2);
6339
6340 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6341 if (X86::isZeroNode(Scl)) {
6342 Ops.push_back(N.getOperand(0));
6343 for (unsigned i = 0; i != NumElts; ++i)
6344 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6345 return true;
6346 }
6347 }
6348
6349 // Peek through trunc/aext/zext/bitcast.
6350 // TODO: aext shouldn't require SM_SentinelZero padding.
6351 // TODO: handle shift of scalars.
6352 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6353 while (Scl.getOpcode() == ISD::TRUNCATE ||
6354 Scl.getOpcode() == ISD::ANY_EXTEND ||
6355 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6356 (Scl.getOpcode() == ISD::BITCAST &&
6359 Scl = Scl.getOperand(0);
6360 MinBitsPerElt =
6361 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6362 }
6363 if ((MinBitsPerElt % 8) != 0)
6364 return false;
6365
6366 // Attempt to find the source vector the scalar was extracted from.
6367 SDValue SrcExtract;
6368 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6369 Scl.getOpcode() == X86ISD::PEXTRW ||
6370 Scl.getOpcode() == X86ISD::PEXTRB) &&
6371 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6372 SrcExtract = Scl;
6373 }
6374 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6375 return false;
6376
6377 SDValue SrcVec = SrcExtract.getOperand(0);
6378 EVT SrcVT = SrcVec.getValueType();
6379 if (!SrcVT.getScalarType().isByteSized())
6380 return false;
6381 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6382 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6383 unsigned DstByte = DstIdx * NumBytesPerElt;
6384 MinBitsPerElt =
6385 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6386
6387 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6388 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6389 Ops.push_back(SrcVec);
6390 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6391 } else {
6392 Ops.push_back(SrcVec);
6393 Ops.push_back(N.getOperand(0));
6394 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6395 Mask.push_back(NumSizeInBytes + i);
6396 }
6397
6398 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6399 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6400 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6401 Mask[DstByte + i] = SrcByte + i;
6402 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6403 Mask[DstByte + i] = SM_SentinelZero;
6404 return true;
6405 }
6406 case X86ISD::PACKSS:
6407 case X86ISD::PACKUS: {
6408 SDValue N0 = N.getOperand(0);
6409 SDValue N1 = N.getOperand(1);
6410 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6411 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6412 "Unexpected input value type");
6413
6414 APInt EltsLHS, EltsRHS;
6415 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6416
6417 // If we know input saturation won't happen (or we don't care for particular
6418 // lanes), we can treat this as a truncation shuffle.
6419 bool Offset0 = false, Offset1 = false;
6420 if (Opcode == X86ISD::PACKSS) {
6421 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6422 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6423 (!(N1.isUndef() || EltsRHS.isZero()) &&
6424 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6425 return false;
6426 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6427 // PACKSS then it was likely being used for sign-extension for a
6428 // truncation, so just peek through and adjust the mask accordingly.
6429 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6430 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6431 Offset0 = true;
6432 N0 = N0.getOperand(0);
6433 }
6434 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6435 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6436 Offset1 = true;
6437 N1 = N1.getOperand(0);
6438 }
6439 } else {
6440 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6441 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6442 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6443 (!(N1.isUndef() || EltsRHS.isZero()) &&
6444 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6445 return false;
6446 }
6447
6448 bool IsUnary = (N0 == N1);
6449
6450 Ops.push_back(N0);
6451 if (!IsUnary)
6452 Ops.push_back(N1);
6453
6454 createPackShuffleMask(VT, Mask, IsUnary);
6455
6456 if (Offset0 || Offset1) {
6457 for (int &M : Mask)
6458 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6459 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6460 ++M;
6461 }
6462 return true;
6463 }
6464 case ISD::VSELECT:
6465 case X86ISD::BLENDV: {
6466 SDValue Cond = N.getOperand(0);
6467 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6468 Ops.push_back(N.getOperand(1));
6469 Ops.push_back(N.getOperand(2));
6470 return true;
6471 }
6472 return false;
6473 }
6474 case X86ISD::VTRUNC: {
6475 SDValue Src = N.getOperand(0);
6476 EVT SrcVT = Src.getValueType();
6477 if (SrcVT.getSizeInBits() != NumSizeInBits)
6478 return false;
6479 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6480 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6481 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6482 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6483 for (unsigned i = 0; i != NumSrcElts; ++i)
6484 Mask.push_back(i * Scale);
6485 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6486 Ops.push_back(Src);
6487 return true;
6488 }
6489 case ISD::SHL:
6490 case ISD::SRL: {
6491 APInt UndefElts;
6492 SmallVector<APInt, 32> EltBits;
6493 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6494 UndefElts, EltBits,
6495 /*AllowWholeUndefs*/ true,
6496 /*AllowPartialUndefs*/ false))
6497 return false;
6498
6499 // We can only decode 'whole byte' bit shifts as shuffles.
6500 for (unsigned I = 0; I != NumElts; ++I)
6501 if (DemandedElts[I] && !UndefElts[I] &&
6502 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6503 return false;
6504
6505 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6506 Ops.push_back(N.getOperand(0));
6507
6508 for (unsigned I = 0; I != NumElts; ++I) {
6509 if (!DemandedElts[I] || UndefElts[I])
6510 continue;
6511 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6512 unsigned Lo = I * NumBytesPerElt;
6513 unsigned Hi = Lo + NumBytesPerElt;
6514 // Clear mask to all zeros and insert the shifted byte indices.
6515 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6516 if (ISD::SHL == Opcode)
6517 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6518 else
6519 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6520 Lo + ByteShift);
6521 }
6522 return true;
6523 }
6524 case X86ISD::VSHLI:
6525 case X86ISD::VSRLI: {
6526 uint64_t ShiftVal = N.getConstantOperandVal(1);
6527 // Out of range bit shifts are guaranteed to be zero.
6528 if (NumBitsPerElt <= ShiftVal) {
6529 Mask.append(NumElts, SM_SentinelZero);
6530 return true;
6531 }
6532
6533 // We can only decode 'whole byte' bit shifts as shuffles.
6534 if ((ShiftVal % 8) != 0)
6535 break;
6536
6537 uint64_t ByteShift = ShiftVal / 8;
6538 Ops.push_back(N.getOperand(0));
6539
6540 // Clear mask to all zeros and insert the shifted byte indices.
6541 Mask.append(NumSizeInBytes, SM_SentinelZero);
6542
6543 if (X86ISD::VSHLI == Opcode) {
6544 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6545 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6546 Mask[i + j] = i + j - ByteShift;
6547 } else {
6548 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6549 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6550 Mask[i + j - ByteShift] = i + j;
6551 }
6552 return true;
6553 }
6554 case X86ISD::VROTLI:
6555 case X86ISD::VROTRI: {
6556 // We can only decode 'whole byte' bit rotates as shuffles.
6557 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6558 if ((RotateVal % 8) != 0)
6559 return false;
6560 Ops.push_back(N.getOperand(0));
6561 int Offset = RotateVal / 8;
6562 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6563 for (int i = 0; i != (int)NumElts; ++i) {
6564 int BaseIdx = i * NumBytesPerElt;
6565 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6566 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6567 }
6568 }
6569 return true;
6570 }
6571 case X86ISD::VBROADCAST: {
6572 SDValue Src = N.getOperand(0);
6573 if (!Src.getSimpleValueType().isVector()) {
6574 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6575 !isNullConstant(Src.getOperand(1)) ||
6576 Src.getOperand(0).getValueType().getScalarType() !=
6577 VT.getScalarType())
6578 return false;
6579 Src = Src.getOperand(0);
6580 }
6581 Ops.push_back(Src);
6582 Mask.append(NumElts, 0);
6583 return true;
6584 }
6586 SDValue Src = N.getOperand(0);
6587 EVT SrcVT = Src.getValueType();
6588 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6589
6590 // Extended source must be a simple vector.
6591 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6592 (NumBitsPerSrcElt % 8) != 0)
6593 return false;
6594
6595 // We can only handle all-signbits extensions.
6596 APInt DemandedSrcElts =
6597 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6598 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6599 return false;
6600
6601 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6602 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6603 for (unsigned I = 0; I != NumElts; ++I)
6604 Mask.append(Scale, I);
6605 Ops.push_back(Src);
6606 return true;
6607 }
6608 case ISD::ZERO_EXTEND:
6609 case ISD::ANY_EXTEND:
6612 SDValue Src = N.getOperand(0);
6613 EVT SrcVT = Src.getValueType();
6614
6615 // Extended source must be a simple vector.
6616 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6617 (SrcVT.getScalarSizeInBits() % 8) != 0)
6618 return false;
6619
6620 bool IsAnyExtend =
6621 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6622 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6623 IsAnyExtend, Mask);
6624 Ops.push_back(Src);
6625 return true;
6626 }
6627 }
6628
6629 return false;
6630}
6631
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
                                              SmallVectorImpl<int> &Mask) {
  // Each input owns a MaskWidth-sized window of mask indices: after
  // compaction, kept input j covers [j * MaskWidth, (j + 1) * MaskWidth).
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    // Window of mask values that currently refer to Inputs[i], relative to the
    // inputs already kept in UsedInputs.
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;

    // Strip UNDEF input usage.
    if (Inputs[i].isUndef())
      for (int &M : Mask)
        if ((lo <= M) && (M < hi))
          M = SM_SentinelUndef;

    // Check for unused inputs.
    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      // Drop this input and shift all later-input mask indices down a window.
      for (int &M : Mask)
        if (lo <= M)
          M -= MaskWidth;
      continue;
    }

    // Check for repeated inputs.
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        continue;
      // Redirect this input's mask values at the earlier duplicate (j) and
      // shift all later-input mask indices down a window.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (IsRepeat)
      continue;

    UsedInputs.push_back(Inputs[i]);
  }
  Inputs = UsedInputs;
}
6673
/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// Returns true if the target shuffle mask was decoded.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   APInt &KnownUndef, APInt &KnownZero,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
    return false; // Limit search depth.

  // Only simple fixed vector types are supported.
  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  // Prefer a true target shuffle decode; fall back to recognizing "faux"
  // shuffle patterns (insertions, extensions, shifts, etc.).
  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
    if (ResolveKnownElts)
      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
    return true;
  }
  if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
                         ResolveKnownElts)) {
    resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
    return true;
  }
  return false;
}
6702
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
  // Convenience overload for callers that don't need the known undef/zero
  // element masks.
  APInt KnownUndef, KnownZero;
  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
                                KnownZero, DAG, Depth, ResolveKnownElts);
}
6712
                                   const SelectionDAG &DAG, unsigned Depth = 0,
                                   bool ResolveKnownElts = true) {
  // Convenience overload that demands all elements of the shuffle result.
  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  unsigned NumElts = Op.getValueType().getVectorNumElements();
  APInt DemandedElts = APInt::getAllOnes(NumElts);
  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
                                ResolveKnownElts);
}
6726
// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
                                 EVT MemVT, MemSDNode *Mem, unsigned Offset,
                                 SelectionDAG &DAG) {
  assert((Opcode == X86ISD::VBROADCAST_LOAD ||
          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
         "Unknown broadcast load type");

  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
  if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
    return SDValue();

  // Build the broadcast-load node reusing the original memop's chain/operand.
  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ops[] = {Mem->getChain(), Ptr};
  SDValue BcstLd = DAG.getMemIntrinsicNode(
      Opcode, DL, Tys, Ops, MemVT,
          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
  // Keep dependent operations ordered: make the broadcast load's chain
  // equivalent to the original memop's chain.
  DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
  return BcstLd;
}
6750
/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
/// Returns a null SDValue if the element cannot be determined (unknown node,
/// depth limit reached, or an undecodable target shuffle).
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
                                   SelectionDAG &DAG, unsigned Depth) {
    return SDValue(); // Limit search depth.

  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
  unsigned NumElems = VT.getVectorNumElements();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    // Mask values >= NumElems select from the second shuffle operand.
    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = VT.getSimpleVT();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
      return SDValue();

    // Sentinel mask values map directly to zero/undef scalars.
    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
    SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into insert_subvector base/sub vector to find scalars.
  if (Opcode == ISD::INSERT_SUBVECTOR) {
    SDValue Vec = Op.getOperand(0);
    SDValue Sub = Op.getOperand(1);
    uint64_t SubIdx = Op.getConstantOperandVal(2);
    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();

    // Index lands inside the inserted subvector, or falls through to base.
    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
    return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
  }

  // Recurse into concat_vectors sub vector to find scalars.
  if (Opcode == ISD::CONCAT_VECTORS) {
    EVT SubVT = Op.getOperand(0).getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    uint64_t SubIdx = Index / NumSubElts;
    uint64_t SubElt = Index % NumSubElts;
    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
  }

  // Recurse into extract_subvector src vector to find scalars.
  if (Opcode == ISD::EXTRACT_SUBVECTOR) {
    SDValue Src = Op.getOperand(0);
    uint64_t SrcIdx = Op.getConstantOperandVal(1);
    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
  }

  // We only peek through bitcasts of the same vector width.
  if (Opcode == ISD::BITCAST) {
    SDValue Src = Op.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
      return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
    return SDValue();
  }

  // Actual nodes that may contain scalar elements

  // For insert_vector_elt - either return the index matching scalar or recurse
  // into the base vector.
  if (Opcode == ISD::INSERT_VECTOR_ELT &&
      isa<ConstantSDNode>(Op.getOperand(2))) {
    if (Op.getConstantOperandAPInt(2) == Index)
      return Op.getOperand(1);
    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
  }

  // scalar_to_vector only defines element 0; the rest are undef.
  if (Opcode == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? Op.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (Opcode == ISD::BUILD_VECTOR)
    return Op.getOperand(Index);

  // Unknown node - give up.
  return SDValue();
}
6852
// Use PINSRB/PINSRW/PINSRD to create a build vector.
                                        const APInt &NonZeroMask,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  // PINSRW needs SSE2; PINSRB/PINSRD need SSE4.1.
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  // NOTE(review): V stays null if NonZeroMask is all-zero — callers appear to
  // guarantee at least one non-zero element; confirm at call sites.
  SDValue V;
  bool First = true;

  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsNonZero = NonZeroMask[i];
    if (!IsNonZero)
      continue;

    // If the build vector contains zeros or our first insertion is not the
    // first index then insert into zero vector to break any register
    // dependency else use SCALAR_TO_VECTOR.
    if (First) {
      First = false;
      if (NumZero || 0 != i)
        V = getZeroVector(VT, Subtarget, DAG, DL);
      else {
        assert(0 == i && "Expected insertion into zero-index");
        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
        V = DAG.getBitcast(VT, V);
        continue;
      }
    }
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
                    DAG.getVectorIdxConstant(i, DL));
  }

  return V;
}
6894
/// Custom lower build_vector of v16i8.
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
                                    DAG, Subtarget);

  SDValue V;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  // If both the lowest 16-bits are non-zero, then convert to MOVD.
  if (!NonZeroMask.extractBits(2, 0).isZero() &&
      !NonZeroMask.extractBits(2, 2).isZero()) {
    // Build the low 32-bit value in a scalar by shifting/OR-ing the non-zero
    // bytes together, then move it in via SCALAR_TO_VECTOR + VZEXT_MOVL.
    for (unsigned I = 0; I != 4; ++I) {
      if (!NonZeroMask[I])
        continue;
      SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
      if (I != 0)
        Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
                          DAG.getConstant(I * 8, DL, MVT::i8));
      V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
    }
    assert(V && "Failed to fold v16i8 vector to zero");
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
    V = DAG.getBitcast(MVT::v8i16, V);
  }
  // Insert the remaining bytes in pairs; skip the first 4 bytes if the MOVD
  // path above already materialized them.
  for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
    bool ThisIsNonZero = NonZeroMask[i];
    bool NextIsNonZero = NonZeroMask[i + 1];
    if (!ThisIsNonZero && !NextIsNonZero)
      continue;

    // Combine the byte pair into a single value to insert as one 16-bit lane.
    SDValue Elt;
    if (ThisIsNonZero) {
      if (NumZero || NextIsNonZero)
        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
      else
        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
    }

    if (NextIsNonZero) {
      SDValue NextElt = Op.getOperand(i + 1);
      if (i == 0 && NumZero)
        NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
      else
        NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
      // The odd byte occupies the high half of the 16-bit lane.
      NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
                            DAG.getConstant(8, DL, MVT::i8));
      if (ThisIsNonZero)
        Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
      else
        Elt = NextElt;
    }

    // If our first insertion is not the first index or zeros are needed, then
    // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
    // elements undefined).
    if (!V) {
      if (i != 0 || NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
      else {
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
        V = DAG.getBitcast(MVT::v8i16, V);
        continue;
      }
    }
    Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
                    DAG.getVectorIdxConstant(i / 2, DL));
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
6976
/// Custom lower build_vector of v8i16.
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Pre-SSE4.1: bail out if there are more than 4 non-zero elements.
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
    return SDValue();

  // Use PINSRW to insert each byte directly.
  return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
                                  Subtarget);
}
6990
/// Custom lower build_vector of v4i32 or v4f32.
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // If this is a splat of a pair of elements, use MOVDDUP (unless the target
  // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
  // Because we're creating a less complicated build vector here, we may enable
  // further folding of the MOVDDUP via shuffle transforms.
  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
      Op.getOperand(0) == Op.getOperand(2) &&
      Op.getOperand(1) == Op.getOperand(3) &&
      Op.getOperand(0) != Op.getOperand(1)) {
    MVT VT = Op.getSimpleValueType();
    MVT EltVT = VT.getVectorElementType();
    // Create a new build vector with the first 2 elements followed by undef
    // padding, bitcast to v2f64, duplicate, and bitcast back.
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
    SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
    return DAG.getBitcast(VT, Dup);
  }

  // Find all zeroable elements.
  std::bitset<4> Zeroable, Undefs;
  for (int i = 0; i < 4; ++i) {
    SDValue Elt = Op.getOperand(i);
    Undefs[i] = Elt.isUndef();
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  // All non-zero elements came from V1 at matching indices - blend with zero.
  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZeroOrUndef = (Zeroable == Undefs)
                               ? DAG.getUNDEF(VT)
                               : getZeroVector(VT, Subtarget, DAG, DL);
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  // All remaining non-zero elements must come from one vector (V1) at matching
  // indices for the single-element INSERTPS fold to apply.
  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  unsigned ZMask = Zeroable.to_ulong();

  // INSERTPS immediate: bits[7:6] = source element, bits[5:4] = destination
  // element, bits[3:0] = zero mask.
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDValue Result =
      DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                  DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
  return DAG.getBitcast(VT, Result);
}
7117
7118/// Return a vector logical shift node.
7119static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7120 SelectionDAG &DAG, const TargetLowering &TLI,
7121 const SDLoc &dl) {
7122 assert(VT.is128BitVector() && "Unknown type for VShift");
7123 MVT ShVT = MVT::v16i8;
7124 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7125 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7126 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7127 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7128 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7129}
7130
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    // Only simple, normal (non-extending) loads can be widened.
    if (!ISD::isNormalLoad(LD) || !LD->isSimple())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    // Identify a frame-index base, optionally plus a constant offset.
    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    Align RequiredAlign(VT.getSizeInBits() / 8);
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
    if (!InferredAlign || *InferredAlign < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be multiple of 4. Then address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % RequiredAlign.value()) & 3)
      return SDValue();
    // Round the offset down to the vector alignment boundary.
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    // Load the whole vector at the aligned base, then splat the lane that
    // holds the originally requested scalar (elements are 4 bytes, hence >>2).
    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
7204
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
// Returns true (setting Ld/ByteOffset) if Elt can be traced back to a simple,
// non-extending load, peeking through bitcasts, truncates, byte-aligned
// right-shifts and constant-index element accesses.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
  // Base case: a direct non-extending load.
  if (ISD::isNON_EXTLoad(Elt.getNode())) {
    auto *BaseLd = cast<LoadSDNode>(Elt);
    if (!BaseLd->isSimple())
      return false;
    Ld = BaseLd;
    ByteOffset = 0;
    return true;
  }

  switch (Elt.getOpcode()) {
  case ISD::BITCAST:
  case ISD::TRUNCATE:
    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
  case ISD::SRL:
    // A right-shift by a whole number of bytes adds to the byte offset within
    // the load.
    if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
      uint64_t Amt = AmtC->getZExtValue();
      if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
        ByteOffset += Amt / 8;
        return true;
      }
    }
    break;
    // A constant element index into a byte-sized element source advances the
    // byte offset by Idx * element-size-in-bytes.
    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
      SDValue Src = Elt.getOperand(0);
      unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
      unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
      if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
          findEltLoadSrc(Src, Ld, ByteOffset)) {
        uint64_t Idx = IdxC->getZExtValue();
        ByteOffset += Idx * (SrcSizeInBits / 8);
        return true;
      }
    }
    break;
  }

  return false;
}
7247
7248/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7249/// elements can be replaced by a single large load which has the same value as
7250/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7251///
7252/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7254 const SDLoc &DL, SelectionDAG &DAG,
7255 const X86Subtarget &Subtarget,
7256 bool IsAfterLegalize) {
// Only byte-sized scalars can be re-expressed as a wider memory access.
7257 if ((VT.getScalarSizeInBits() % 8) != 0)
7258 return SDValue();
7259
7260 unsigned NumElems = Elts.size();
7261
7262 int LastLoadedElt = -1;
7263 APInt LoadMask = APInt::getZero(NumElems);
7264 APInt ZeroMask = APInt::getZero(NumElems);
7265 APInt UndefMask = APInt::getZero(NumElems);
7266
// Per-element source load and the byte offset of that element inside it.
7267 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7268 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7269
7270 // For each element in the initializer, see if we've found a load, zero or an
7271 // undef.
7272 for (unsigned i = 0; i < NumElems; ++i) {
7273 SDValue Elt = peekThroughBitcasts(Elts[i]);
7274 if (!Elt.getNode())
7275 return SDValue();
7276 if (Elt.isUndef()) {
7277 UndefMask.setBit(i);
7278 continue;
7279 }
7281 ZeroMask.setBit(i);
7282 continue;
7283 }
7284
7285 // Each loaded element must be the correct fractional portion of the
7286 // requested vector load.
7287 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7288 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7289 return SDValue();
7290
7291 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7292 return SDValue();
// The element must lie entirely inside its source load's value.
7293 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7294 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7295 return SDValue();
7296
7297 LoadMask.setBit(i);
7298 LastLoadedElt = i;
7299 }
7300 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7301 NumElems &&
7302 "Incomplete element masks");
7303
7304 // Handle Special Cases - all undef or undef/zero.
7305 if (UndefMask.popcount() == NumElems)
7306 return DAG.getUNDEF(VT);
7307 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7308 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7309 : DAG.getConstantFP(0.0, DL, VT);
7310
7311 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7312 int FirstLoadedElt = LoadMask.countr_zero();
7313 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7314 EVT EltBaseVT = EltBase.getValueType();
7315 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7316 "Register/Memory size mismatch");
7317 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7318 assert(LDBase && "Did not find base load for merging consecutive loads");
7319 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7320 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7321 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7322 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7323 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7324
7325 // TODO: Support offsetting the base load.
7326 if (ByteOffsets[FirstLoadedElt] != 0)
7327 return SDValue();
7328
7329 // Check to see if the element's load is consecutive to the base load
7330 // or offset from a previous (already checked) load.
7331 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7332 LoadSDNode *Ld = Loads[EltIdx];
7333 int64_t ByteOffset = ByteOffsets[EltIdx];
// An offset element is OK if it references an earlier element of the same
// load that sits at the expected whole-element distance with offset 0.
7334 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7335 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7336 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7337 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7338 }
7339 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7340 EltIdx - FirstLoadedElt);
7341 };
7342
7343 // Consecutive loads can contain UNDEFS but not ZERO elements.
7344 // Consecutive loads with UNDEFs and ZEROs elements require a
7345 // an additional shuffle stage to clear the ZERO elements.
7346 bool IsConsecutiveLoad = true;
7347 bool IsConsecutiveLoadWithZeros = true;
7348 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7349 if (LoadMask[i]) {
7350 if (!CheckConsecutiveLoad(LDBase, i)) {
7351 IsConsecutiveLoad = false;
7352 IsConsecutiveLoadWithZeros = false;
7353 break;
7354 }
7355 } else if (ZeroMask[i]) {
7356 IsConsecutiveLoad = false;
7357 }
7358 }
7359
// Emit a single wide load in place of the merged element loads, and make the
// old loads' chains depend on it so memory ordering is preserved.
7360 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7361 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7362 assert(LDBase->isSimple() &&
7363 "Cannot merge volatile or atomic loads.");
7364 SDValue NewLd =
7365 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7366 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7367 for (auto *LD : Loads)
7368 if (LD)
7369 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7370 return NewLd;
7371 };
7372
7373 // Check if the base load is entirely dereferenceable.
7374 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7375 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7376
7377 // LOAD - all consecutive load/undefs (must start/end with a load or be
7378 // entirely dereferenceable). If we have found an entire vector of loads and
7379 // undefs, then return a large load of the entire vector width starting at the
7380 // base pointer. If the vector contains zeros, then attempt to shuffle those
7381 // elements.
7382 if (FirstLoadedElt == 0 &&
7383 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7384 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7385 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7386 return SDValue();
7387
7388 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7389 // will lower to regular temporal loads and use the cache.
7390 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7391 VT.is256BitVector() && !Subtarget.hasInt256())
7392 return SDValue();
7393
7394 if (NumElems == 1)
7395 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7396
7397 if (!ZeroMask)
7398 return CreateLoad(VT, LDBase);
7399
7400 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7401 // vector and a zero vector to clear out the zero elements.
7402 if (!IsAfterLegalize && VT.isVector()) {
7403 unsigned NumMaskElts = VT.getVectorNumElements();
7404 if ((NumMaskElts % NumElems) == 0) {
7405 unsigned Scale = NumMaskElts / NumElems;
7406 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7407 for (unsigned i = 0; i < NumElems; ++i) {
7408 if (UndefMask[i])
7409 continue;
// Zero elements select from the zero vector (second shuffle operand).
7410 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7411 for (unsigned j = 0; j != Scale; ++j)
7412 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7413 }
7414 SDValue V = CreateLoad(VT, LDBase);
7415 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7416 : DAG.getConstantFP(0.0, DL, VT);
7417 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7418 }
7419 }
7420 }
7421
7422 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7423 if (VT.is256BitVector() || VT.is512BitVector()) {
7424 unsigned HalfNumElems = NumElems / 2;
7425 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7426 EVT HalfVT =
7427 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7428 SDValue HalfLD =
7429 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7430 DAG, Subtarget, IsAfterLegalize);
7431 if (HalfLD)
7432 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7433 HalfLD, DAG.getVectorIdxConstant(0, DL));
7434 }
7435 }
7436
7437 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7438 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7439 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7440 LoadSizeInBits == 64) &&
7441 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7442 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7443 : MVT::getIntegerVT(LoadSizeInBits);
7444 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7445 // Allow v4f32 on SSE1 only targets.
7446 // FIXME: Add more isel patterns so we can just use VT directly.
7447 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7448 VecVT = MVT::v4f32;
7449 if (TLI.isTypeLegal(VecVT)) {
7450 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7451 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7452 SDValue ResNode = DAG.getMemIntrinsicNode(
7453 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7455 for (auto *LD : Loads)
7456 if (LD)
7457 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7458 return DAG.getBitcast(VT, ResNode);
7459 }
7460 }
7461
7462 // BROADCAST - match the smallest possible repetition pattern, load that
7463 // scalar/subvector element and then broadcast to the entire vector.
7464 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7465 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
// Try repetition widths of 1, 2, 4, ... elements, smallest first.
7466 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7467 unsigned RepeatSize = SubElems * BaseSizeInBits;
7468 unsigned ScalarSize = std::min(RepeatSize, 64u);
7469 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7470 continue;
7471
7472 // Don't attempt a 1:N subvector broadcast - it should be caught by
7473 // combineConcatVectorOps, else will cause infinite loops.
7474 if (RepeatSize > ScalarSize && SubElems == 1)
7475 continue;
7476
7477 bool Match = true;
7478 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7479 for (unsigned i = 0; i != NumElems && Match; ++i) {
7480 if (!LoadMask[i])
7481 continue;
7482 SDValue Elt = peekThroughBitcasts(Elts[i]);
7483 if (RepeatedLoads[i % SubElems].isUndef())
7484 RepeatedLoads[i % SubElems] = Elt;
7485 else
7486 Match &= (RepeatedLoads[i % SubElems] == Elt);
7487 }
7488
7489 // We must have loads at both ends of the repetition.
7490 Match &= !RepeatedLoads.front().isUndef();
7491 Match &= !RepeatedLoads.back().isUndef();
7492 if (!Match)
7493 continue;
7494
7495 EVT RepeatVT =
7496 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7497 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7498 : EVT::getFloatingPointVT(ScalarSize);
7499 if (RepeatSize > ScalarSize)
7500 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7501 RepeatSize / ScalarSize);
7502 EVT BroadcastVT =
7503 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7504 VT.getSizeInBits() / ScalarSize);
7505 if (TLI.isTypeLegal(BroadcastVT)) {
// Recursively merge the repeated element loads, then widen by
// concatenation (subvector repeat) or an explicit VBROADCAST (scalar).
7506 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7507 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7508 SDValue Broadcast = RepeatLoad;
7509 if (RepeatSize > ScalarSize) {
7510 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7511 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7512 } else {
7513 if (!Subtarget.hasAVX2() &&
7515 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7516 Subtarget,
7517 /*AssumeSingleUse=*/true))
7518 return SDValue();
7519 Broadcast =
7520 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7521 }
7522 return DAG.getBitcast(VT, Broadcast);
7523 }
7524 }
7525 }
7526 }
7527
7528 return SDValue();
7529}
7530
7531// Combine vector ops (shuffles etc.) that are equal to build_vector load1,
7532// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7533// are consecutive, non-overlapping, and in the right order.
// Returns SDValue() if any element of Op cannot be resolved to a scalar.
7535 SelectionDAG &DAG,
7536 const X86Subtarget &Subtarget,
7537 bool IsAfterLegalize) {
// Resolve every lane of Op to the scalar that produces it, then defer to
// EltsFromConsecutiveLoads to do the actual load merging.
7539 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7540 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7541 Elts.push_back(Elt);
7542 continue;
7543 }
7544 return SDValue();
7545 }
7546 assert(Elts.size() == VT.getVectorNumElements());
7547 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7548 IsAfterLegalize);
7549}
7550
// Build an IR ConstantVector whose lanes are the given 'Bits' values (with
// lanes flagged in 'Undefs' becoming UndefValue). Each lane is materialized
// as a ConstantFP of the matching IEEE semantics for fp types, or as an
// integer constant of the scalar type otherwise.
7552 const APInt &Undefs, LLVMContext &C) {
7553 unsigned ScalarSize = VT.getScalarSizeInBits();
7554 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7555
// Convert one lane's raw bits to a scalar Constant of the right type.
7556 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7557 if (VT.isFloatingPoint()) {
7558 if (ScalarSize == 16)
7559 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7560 if (ScalarSize == 32)
7561 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7562 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7563 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7564 }
7565 return Constant::getIntegerValue(Ty, Val);
7566 };
7567
7568 SmallVector<Constant *, 32> ConstantVec;
7569 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7570 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7571 : getConstantScalar(Bits[I]));
7572
7573 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7574}
7575
7576static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7577 unsigned SplatBitSize, LLVMContext &C) {
7578 unsigned ScalarSize = VT.getScalarSizeInBits();
7579
7580 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7581 if (VT.isFloatingPoint()) {
7582 if (ScalarSize == 16)
7583 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7584 if (ScalarSize == 32)
7585 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7586 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7587 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7588 }
7589 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7590 };
7591
7592 if (ScalarSize == SplatBitSize)
7593 return getConstantScalar(SplatValue);
7594
7595 unsigned NumElm = SplatBitSize / ScalarSize;
7596 SmallVector<Constant *, 32> ConstantVec;
7597 for (unsigned I = 0; I != NumElm; ++I) {
7598 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7599 ConstantVec.push_back(getConstantScalar(Val));
7600 }
7601 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7602}
7603
// Walk N's users and decide from the first matching user whether N can be
// folded into its use(s); remaining users are not inspected once a rule
// returns. Bitcast users are looked through recursively.
7605 for (auto *U : N->users()) {
7606 unsigned Opc = U->getOpcode();
7607 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7608 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7609 return false;
7610 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7611 return false;
// Any other target shuffle user is assumed foldable.
7612 if (isTargetShuffle(Opc))
7613 return true;
7614 if (Opc == ISD::BITCAST) // Ignore bitcasts
7615 return isFoldableUseOfShuffle(U);
7616 if (N->hasOneUse()) {
7617 // TODO, there may be some general way to know if a SDNode can
7618 // be folded. We now only know whether an MI is foldable.
// VPDPBUSD can only fold its third operand.
7619 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7620 return false;
7621 return true;
7622 }
7623 }
7624 return false;
7625}
7626
7627// If the node has a single use by a VSELECT then AVX512 targets may be able to
7628// fold as a predicated instruction.
7629static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7630 unsigned SizeInBits = V.getValueSizeInBits();
7631 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7632 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7633 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7634 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7635 return true;
7636 }
7637 }
7638 return false;
7639}
7640
7641/// Attempt to use the vbroadcast instruction to generate a splat value
7642/// from a splat BUILD_VECTOR which uses:
7643/// a. A single scalar load, or a constant.
7644/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7645///
7646/// The VBROADCAST node is returned when a pattern is found,
7647/// or SDValue() otherwise.
7649 const SDLoc &dl,
7650 const X86Subtarget &Subtarget,
7651 SelectionDAG &DAG) {
7652 // VBROADCAST requires AVX.
7653 // TODO: Splats could be generated for non-AVX CPUs using SSE
7654 // instructions, but there's less potential gain for only 128-bit vectors.
7655 if (!Subtarget.hasAVX())
7656 return SDValue();
7657
7658 MVT VT = BVOp->getSimpleValueType(0);
7659 unsigned NumElts = VT.getVectorNumElements();
7660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7661 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7662 "Unsupported vector type for broadcast.");
7663
7664 // See if the build vector is a repeating sequence of scalars (inc. splat).
7665 SDValue Ld;
7666 BitVector UndefElements;
7667 SmallVector<SDValue, 16> Sequence;
7668 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7669 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
// A length-1 sequence is a true splat - remember the splatted scalar.
7670 if (Sequence.size() == 1)
7671 Ld = Sequence[0];
7672 }
7673
7674 // Attempt to use VBROADCASTM
7675 // From this pattern:
7676 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7677 // b. t1 = (build_vector t0 t0)
7678 //
7679 // Create (VBROADCASTM v2i1 X)
7680 if (!Sequence.empty() && Subtarget.hasCDI()) {
7681 // If not a splat, are the upper sequence values zeroable?
7682 unsigned SeqLen = Sequence.size();
7683 bool UpperZeroOrUndef =
7684 SeqLen == 1 ||
7685 llvm::all_of(ArrayRef(Sequence).drop_front(),
7686 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7687 SDValue Op0 = Sequence[0];
7688 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7689 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7690 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7691 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7692 ? Op0.getOperand(0)
7693 : Op0.getOperand(0).getOperand(0);
7694 MVT MaskVT = BOperand.getSimpleValueType();
7695 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7696 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7697 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7698 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
// Without VLX, VBROADCASTM is only available at 512 bits - widen then
// extract the requested width afterwards.
7699 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7700 unsigned Scale = 512 / VT.getSizeInBits();
7701 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7702 }
7703 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7704 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7705 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7706 return DAG.getBitcast(VT, Bcst);
7707 }
7708 }
7709 }
7710
7711 unsigned NumUndefElts = UndefElements.count();
7712 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7713 APInt SplatValue, Undef;
7714 unsigned SplatBitSize;
7715 bool HasUndef;
7716 // Check if this is a repeated constant pattern suitable for broadcasting.
7717 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7718 SplatBitSize > VT.getScalarSizeInBits() &&
7719 SplatBitSize < VT.getSizeInBits()) {
7720 // Avoid replacing with broadcast when it's a use of a shuffle
7721 // instruction to preserve the present custom lowering of shuffles.
7722 if (isFoldableUseOfShuffle(BVOp))
7723 return SDValue();
7724 // replace BUILD_VECTOR with broadcast of the repeated constants.
7725 LLVMContext *Ctx = DAG.getContext();
7726 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7727 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7728 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7729 // Load the constant scalar/subvector and broadcast it.
7730 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7731 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7732 SDValue CP = DAG.getConstantPool(C, PVT);
7733 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7734
7735 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7736 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7737 SDValue Ops[] = {DAG.getEntryNode(), CP};
7738 MachinePointerInfo MPI =
7740 SDValue Brdcst =
7741 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7742 MPI, Alignment, MachineMemOperand::MOLoad);
7743 return DAG.getBitcast(VT, Brdcst);
7744 }
7745 if (SplatBitSize > 64) {
7746 // Load the vector of constants and broadcast it.
7747 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7748 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7749 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7750 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7751 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7752 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7753 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7754 MachinePointerInfo MPI =
7757 Ops, VVT, MPI, Alignment,
7759 }
7760 }
7761
7762 // If we are moving a scalar into a vector (Ld must be set and all elements
7763 // but 1 are undef) and that operation is not obviously supported by
7764 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7765 // That's better than general shuffling and may eliminate a load to GPR and
7766 // move from scalar to vector register.
7767 if (!Ld || NumElts - NumUndefElts != 1)
7768 return SDValue();
7769 unsigned ScalarSize = Ld.getValueSizeInBits();
7770 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7771 return SDValue();
7772 }
7773
7774 bool ConstSplatVal =
7775 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7776 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7777
7778 // TODO: Handle broadcasts of non-constant sequences.
7779
7780 // Make sure that all of the users of a non-constant load are from the
7781 // BUILD_VECTOR node.
7782 // FIXME: Is the use count needed for non-constant, non-load case?
7783 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7784 return SDValue();
7785
7786 unsigned ScalarSize = Ld.getValueSizeInBits();
7787 bool IsGE256 = (VT.getSizeInBits() >= 256);
7788
7789 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7790 // instruction to save 8 or more bytes of constant pool data.
7791 // TODO: If multiple splats are generated to load the same constant,
7792 // it may be detrimental to overall size. There needs to be a way to detect
7793 // that condition to know if this is truly a size win.
7794 bool OptForSize = DAG.shouldOptForSize();
7795
7796 // Handle broadcasting a single constant scalar from the constant pool
7797 // into a vector.
7798 // On Sandybridge (no AVX2), it is still better to load a constant vector
7799 // from the constant pool and not to broadcast it from a scalar.
7800 // But override that restriction when optimizing for size.
7801 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7802 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7803 EVT CVT = Ld.getValueType();
7804 assert(!CVT.isVector() && "Must not broadcast a vector type");
7805
7806 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7807 // For size optimization, also splat v2f64 and v2i64, and for size opt
7808 // with AVX2, also splat i8 and i16.
7809 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7810 if (ScalarSize == 32 ||
7811 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7812 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7813 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7814 const Constant *C = nullptr;
7815 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7816 C = CI->getConstantIntValue();
7817 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7818 C = CF->getConstantFPValue();
7819
7820 assert(C && "Invalid constant type");
7821
7822 SDValue CP =
7824 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7825
7826 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7827 SDValue Ops[] = {DAG.getEntryNode(), CP};
7828 MachinePointerInfo MPI =
7830 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7831 MPI, Alignment, MachineMemOperand::MOLoad);
7832 }
7833 }
7834
7835 // Handle AVX2 in-register broadcasts.
7836 if (!IsLoad && Subtarget.hasInt256() &&
7837 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7838 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7839
7840 // The scalar source must be a normal load.
7841 if (!IsLoad)
7842 return SDValue();
7843
7844 // Make sure the non-chain result is only used by this build vector.
7845 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7846 return SDValue();
7847
7848 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7849 (Subtarget.hasVLX() && ScalarSize == 64)) {
7850 auto *LN = cast<LoadSDNode>(Ld);
7851 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7852 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7853 SDValue BCast =
7855 LN->getMemoryVT(), LN->getMemOperand());
// Redirect the old load's chain users to the broadcast-load's chain.
7856 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7857 return BCast;
7858 }
7859
7860 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
7861 // double since there is no vbroadcastsd xmm
7862 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7863 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7864 auto *LN = cast<LoadSDNode>(Ld);
7865 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7866 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7867 SDValue BCast =
7869 LN->getMemoryVT(), LN->getMemOperand());
7870 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7871 return BCast;
7872 }
7873
7874 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7875 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7876
7877 // Unsupported broadcast.
7878 return SDValue();
7879}
7880
7881/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7882/// underlying vector and index.
7883///
7884/// Modifies \p ExtractedFromVec to the real vector and returns the real
7885/// index.
7886static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7887 SDValue ExtIdx) {
7888 int Idx = ExtIdx->getAsZExtVal();
7889 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7890 return Idx;
7891
7892 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7893 // lowered this:
7894 // (extract_vector_elt (v8f32 %1), Constant<6>)
7895 // to:
7896 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7897 // (extract_subvector (v8f32 %0), Constant<4>),
7898 // undef)
7899 // Constant<0>)
7900 // In this case the vector is the extract_subvector expression and the index
7901 // is 2, as specified by the shuffle.
7902 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7903 SDValue ShuffleVec = SVOp->getOperand(0);
7904 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7905 assert(ShuffleVecVT.getVectorElementType() ==
7906 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7907
7908 int ShuffleIdx = SVOp->getMaskElt(Idx);
7909 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7910 ExtractedFromVec = ShuffleVec;
7911 return ShuffleIdx;
7912 }
7913 return Idx;
7914}
7915
// Lower a BUILD_VECTOR whose operands are mostly extract_vector_elt from at
// most two same-typed source vectors as a vector_shuffle of those sources,
// followed by at most one insert_vector_elt for a non-extract operand.
7917 SelectionDAG &DAG) {
7918 MVT VT = Op.getSimpleValueType();
7919
7920 // Skip if insert_vec_elt is not supported.
7921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7923 return SDValue();
7924
7925 unsigned NumElems = Op.getNumOperands();
7926 SDValue VecIn1;
7927 SDValue VecIn2;
7928 SmallVector<unsigned, 4> InsertIndices;
7929 SmallVector<int, 8> Mask(NumElems, -1);
7930
7931 for (unsigned i = 0; i != NumElems; ++i) {
7932 unsigned Opc = Op.getOperand(i).getOpcode();
7933
7934 if (Opc == ISD::UNDEF)
7935 continue;
7936
7938 // Quit if more than 1 elements need inserting.
7939 if (InsertIndices.size() > 1)
7940 return SDValue();
7941
7942 InsertIndices.push_back(i);
7943 continue;
7944 }
7945
7946 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7947 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7948
7949 // Quit if non-constant index.
7950 if (!isa<ConstantSDNode>(ExtIdx))
7951 return SDValue();
// Resolve through any shuffle to the real source vector/index.
7952 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7953
7954 // Quit if extracted from vector of different type.
7955 if (ExtractedFromVec.getValueType() != VT)
7956 return SDValue();
7957
7958 if (!VecIn1.getNode())
7959 VecIn1 = ExtractedFromVec;
7960 else if (VecIn1 != ExtractedFromVec) {
7961 if (!VecIn2.getNode())
7962 VecIn2 = ExtractedFromVec;
7963 else if (VecIn2 != ExtractedFromVec)
7964 // Quit if more than 2 vectors to shuffle
7965 return SDValue();
7966 }
7967
// Mask indices [0, NumElems) select from VecIn1, [NumElems, 2*NumElems)
// select from VecIn2, matching vector_shuffle's two-operand convention.
7968 if (ExtractedFromVec == VecIn1)
7969 Mask[i] = Idx;
7970 else if (ExtractedFromVec == VecIn2)
7971 Mask[i] = Idx + NumElems;
7972 }
7973
7974 if (!VecIn1.getNode())
7975 return SDValue();
7976
7977 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7978 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7979
// Insert the (at most one) non-extract element on top of the shuffle.
7980 for (unsigned Idx : InsertIndices)
7981 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7983
7984 return NV;
7985}
7986
7987// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
// bf16 has no dedicated build_vector lowering, so each operand is bitcast to
// f16 (when FP16 is available) or i16, the vector is built in that type, and
// the result is bitcast back to the bf16 vector type.
7989 const X86Subtarget &Subtarget) {
7990 MVT VT = Op.getSimpleValueType();
7991 MVT IVT =
7992 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7994 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7995 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7996 Op.getOperand(I)));
7997 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7998 return DAG.getBitcast(VT, Res);
7999}
8000
8001// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
// Constant lanes are collected into an immediate bitmask; non-constant lanes
// are inserted individually afterwards. A splat of one non-constant value is
// lowered via a scalar select (cmov) instead.
8003 SelectionDAG &DAG,
8004 const X86Subtarget &Subtarget) {
8005
8006 MVT VT = Op.getSimpleValueType();
8007 assert((VT.getVectorElementType() == MVT::i1) &&
8008 "Unexpected type in LowerBUILD_VECTORvXi1!");
8009 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8010 ISD::isBuildVectorAllOnes(Op.getNode()))
8011 return Op;
8012
8013 uint64_t Immediate = 0;
8014 SmallVector<unsigned, 16> NonConstIdx;
8015 bool IsSplat = true;
8016 bool HasConstElts = false;
8017 int SplatIdx = -1;
// Classify each lane: fold constants into 'Immediate', remember the indices
// of non-constant lanes, and track whether all defined lanes are identical.
8018 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8019 SDValue In = Op.getOperand(idx);
8020 if (In.isUndef())
8021 continue;
8022 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8023 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8024 HasConstElts = true;
8025 } else {
8026 NonConstIdx.push_back(idx);
8027 }
8028 if (SplatIdx < 0)
8029 SplatIdx = idx;
8030 else if (In != Op.getOperand(SplatIdx))
8031 IsSplat = false;
8032 }
8033
8034 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
8035 if (IsSplat) {
8036 // The build_vector allows the scalar element to be larger than the vector
8037 // element type. We need to mask it to use as a condition unless we know
8038 // the upper bits are zero.
8039 // FIXME: Use computeKnownBits instead of checking specific opcode?
8040 SDValue Cond = Op.getOperand(SplatIdx);
8041 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8042 if (Cond.getOpcode() != ISD::SETCC)
8043 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8044 DAG.getConstant(1, dl, MVT::i8));
8045
8046 // Perform the select in the scalar domain so we can use cmov.
8047 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
// 32-bit targets can't hold a 64-bit mask in one GPR - build two v32i1
// halves from the same select and concatenate.
8048 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8049 DAG.getAllOnesConstant(dl, MVT::i32),
8050 DAG.getConstant(0, dl, MVT::i32));
8051 Select = DAG.getBitcast(MVT::v32i1, Select);
8052 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8053 } else {
8054 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8055 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8056 DAG.getAllOnesConstant(dl, ImmVT),
8057 DAG.getConstant(0, dl, ImmVT));
8058 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8059 Select = DAG.getBitcast(VecVT, Select);
8060 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8061 DAG.getVectorIdxConstant(0, dl));
8062 }
8063 }
8064
8065 // insert elements one by one
8066 SDValue DstVec;
8067 if (HasConstElts) {
// Materialize the constant lanes as a mask-register value first.
8068 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8069 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8070 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8071 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8072 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8073 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8074 } else {
8075 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8076 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8077 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8078 DstVec = DAG.getBitcast(VecVT, Imm);
8079 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8080 DAG.getVectorIdxConstant(0, dl));
8081 }
8082 } else
8083 DstVec = DAG.getUNDEF(VT);
8084
// Then insert each non-constant lane on top of the constant base.
8085 for (unsigned InsertIdx : NonConstIdx) {
8086 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8087 Op.getOperand(InsertIdx),
8088 DAG.getVectorIdxConstant(InsertIdx, dl));
8089 }
8090 return DstVec;
8091}
8092
8093LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8094 switch (Opcode) {
8095 case X86ISD::PACKSS:
8096 case X86ISD::PACKUS:
8097 case X86ISD::FHADD:
8098 case X86ISD::FHSUB:
8099 case X86ISD::HADD:
8100 case X86ISD::HSUB:
8101 return true;
8102 }
8103 return false;
8104}
8105
8106/// This is a helper function of LowerToHorizontalOp().
8107/// This function checks that the build_vector \p N in input implements a
8108/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8109/// may not match the layout of an x86 256-bit horizontal instruction.
8110/// In other words, if this returns true, then some extraction/insertion will
8111/// be required to produce a valid horizontal instruction.
8112///
8113/// Parameter \p Opcode defines the kind of horizontal operation to match.
8114/// For example, if \p Opcode is equal to ISD::ADD, then this function
8115/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8116/// is equal to ISD::SUB, then this function checks if this is a horizontal
8117/// arithmetic sub.
8118///
8119/// This function only analyzes elements of \p N whose indices are
8120/// in range [BaseIdx, LastIdx).
8121///
8122/// TODO: This function was originally used to match both real and fake partial
8123/// horizontal operations, but the index-matching logic is incorrect for that.
8124/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8125/// code because it is only used for partial h-op matching now?
///
/// On success the two source vectors are returned via \p V0 / \p V1 (either
/// may stay UNDEF if only one source contributes to the analyzed range).
8126static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8127 const SDLoc &DL, SelectionDAG &DAG,
8128 unsigned BaseIdx, unsigned LastIdx,
8129 SDValue &V0, SDValue &V1) {
8130 EVT VT = N->getValueType(0);
8131 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8132 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8133 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8134 "Invalid Vector in input!");
8135
8136 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8137 bool CanFold = true;
8138 unsigned ExpectedVExtractIdx = BaseIdx;
8139 unsigned NumElts = LastIdx - BaseIdx;
8140 V0 = DAG.getUNDEF(VT);
8141 V1 = DAG.getUNDEF(VT);
8142
8143 // Check if N implements a horizontal binop.
8144 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8145 SDValue Op = N->getOperand(i + BaseIdx);
8146
8147 // Skip UNDEFs.
8148 if (Op->isUndef()) {
8149 // Update the expected vector extract index.
// Crossing from the first to the second half of the range restarts the
// expected extract index at BaseIdx (the second half reads from V1).
8150 if (i * 2 == NumElts)
8151 ExpectedVExtractIdx = BaseIdx;
8152 ExpectedVExtractIdx += 2;
8153 continue;
8154 }
8155
// Every used element must be produced by a single-use node of the
// requested binop opcode.
8156 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8157
8158 if (!CanFold)
8159 break;
8160
8161 SDValue Op0 = Op.getOperand(0);
8162 SDValue Op1 = Op.getOperand(1);
8163
8164 // Try to match the following pattern:
8165 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
// NOTE(review): the matching opcode check on Op1 (EXTRACT_VECTOR_ELT) is
// part of this condition in the full source; this extract dropped a line.
8166 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8168 Op0.getOperand(0) == Op1.getOperand(0) &&
8169 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8170 isa<ConstantSDNode>(Op1.getOperand(1)));
8171 if (!CanFold)
8172 break;
8173
8174 unsigned I0 = Op0.getConstantOperandVal(1);
8175 unsigned I1 = Op1.getConstantOperandVal(1);
8176
// The first half of the analyzed range must draw from V0, the second half
// from V1; both sources must have the build_vector's type.
8177 if (i * 2 < NumElts) {
8178 if (V0.isUndef()) {
8179 V0 = Op0.getOperand(0);
8180 if (V0.getValueType() != VT)
8181 return false;
8182 }
8183 } else {
8184 if (V1.isUndef()) {
8185 V1 = Op0.getOperand(0);
8186 if (V1.getValueType() != VT)
8187 return false;
8188 }
8189 if (i * 2 == NumElts)
8190 ExpectedVExtractIdx = BaseIdx;
8191 }
8192
8193 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8194 if (I0 == ExpectedVExtractIdx)
8195 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8196 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8197 // Try to match the following dag sequence:
8198 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8199 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8200 } else
8201 CanFold = false;
8202
// Each result element consumes a pair of adjacent source elements.
8203 ExpectedVExtractIdx += 2;
8204 }
8205
8206 return CanFold;
8207}
8208
8209/// Emit a sequence of two 128-bit horizontal add/sub followed by
8210/// a concat_vector.
8211///
8212/// This is a helper function of LowerToHorizontalOp().
8213/// This function expects two 256-bit vectors called V0 and V1.
8214/// At first, each vector is split into two separate 128-bit vectors.
8215/// Then, the resulting 128-bit vectors are used to implement two
8216/// horizontal binary operations.
8217///
8218/// The kind of horizontal binary operation is defined by \p X86Opcode.
8219///
8220/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8221/// the two new horizontal binop.
8222/// When Mode is set, the first horizontal binop dag node would take as input
8223/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8224/// horizontal binop dag node would take as input the lower 128-bit of V1
8225/// and the upper 128-bit of V1.
8226/// Example:
8227/// HADD V0_LO, V0_HI
8228/// HADD V1_LO, V1_HI
8229///
8230/// Otherwise, the first horizontal binop dag node takes as input the lower
8231/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8232/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8233/// Example:
8234/// HADD V0_LO, V1_LO
8235/// HADD V0_HI, V1_HI
8236///
8237/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8238/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8239/// the upper 128-bits of the result.
8240static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8241 const SDLoc &DL, SelectionDAG &DAG,
8242 unsigned X86Opcode, bool Mode,
8243 bool isUndefLO, bool isUndefHI) {
8244 MVT VT = V0.getSimpleValueType();
8245 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8246 "Invalid nodes in input!");
8247
8248 unsigned NumElts = VT.getVectorNumElements();
8249 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8250 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8251 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8252 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8253 MVT NewVT = V0_LO.getSimpleValueType();
8254
8255 SDValue LO = DAG.getUNDEF(NewVT);
8256 SDValue HI = DAG.getUNDEF(NewVT);
8257
8258 if (Mode) {
8259 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8260 if (!isUndefLO && !V0->isUndef())
8261 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8262 if (!isUndefHI && !V1->isUndef())
8263 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8264 } else {
8265 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8266 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8267 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8268
8269 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8270 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8271 }
8272
8273 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8274}
8275
8276/// Returns true iff \p BV builds a vector with the result equivalent to
8277/// the result of ADDSUB/SUBADD operation.
8278/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8279/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8280/// \p Opnd0 and \p Opnd1.
///
/// \p NumExtracts returns how many element extractions were matched,
/// \p IsSubAdd distinguishes SUBADD (even lanes FADD) from ADDSUB, and
/// \p HasAllowContract reports whether every matched FADD/FSUB carried the
/// 'contract' fast-math flag.
8282 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8283 SDValue &Opnd0, SDValue &Opnd1,
8284 unsigned &NumExtracts, bool &IsSubAdd,
8285 bool &HasAllowContract) {
8286 using namespace SDPatternMatch;
8287
8288 MVT VT = BV->getSimpleValueType(0);
8289 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8290 return false;
8291
8292 unsigned NumElts = VT.getVectorNumElements();
8293 SDValue InVec0 = DAG.getUNDEF(VT);
8294 SDValue InVec1 = DAG.getUNDEF(VT);
8295
8296 NumExtracts = 0;
// Start optimistically (for non-empty vectors); cleared below if any
// matched element lacks the 'contract' flag.
8297 HasAllowContract = NumElts != 0;
8298
8299 // Odd-numbered elements in the input build vector are obtained from
8300 // adding/subtracting two integer/float elements.
8301 // Even-numbered elements in the input build vector are obtained from
8302 // subtracting/adding two integer/float elements.
8303 unsigned Opc[2] = {0, 0};
8304 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8305 SDValue Op = BV->getOperand(i);
8306
8307 // Skip 'undef' values.
8308 unsigned Opcode = Op.getOpcode();
8309 if (Opcode == ISD::UNDEF)
8310 continue;
8311
8312 // Early exit if we found an unexpected opcode.
8313 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8314 return false;
8315
8316 SDValue Op0 = Op.getOperand(0);
8317 SDValue Op1 = Op.getOperand(1);
8318
8319 // Try to match the following pattern:
8320 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8321 // Early exit if we cannot match that sequence.
8322 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8323 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8324 return false;
8325
8326 // We found a valid add/sub node, make sure its the same opcode as previous
8327 // elements for this parity.
8328 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8329 return false;
8330 Opc[i % 2] = Opcode;
8331
8332 // Update InVec0 and InVec1.
8333 if (InVec0.isUndef())
8334 InVec0 = Op0.getOperand(0);
8335 if (InVec1.isUndef())
8336 InVec1 = Op1.getOperand(0);
8337
8338 // Make sure that operands in input to each add/sub node always
8339 // come from a same pair of vectors.
8340 if (InVec0 != Op0.getOperand(0)) {
8341 if (Opcode == ISD::FSUB)
8342 return false;
8343
8344 // FADD is commutable. Try to commute the operands
8345 // and then test again.
8346 std::swap(Op0, Op1);
8347 if (InVec0 != Op0.getOperand(0))
8348 return false;
8349 }
8350
8351 if (InVec1 != Op1.getOperand(0))
8352 return false;
8353
8354 // Increment the number of extractions done.
8355 ++NumExtracts;
8356 HasAllowContract &= Op->getFlags().hasAllowContract();
8357 }
8358
8359 // Ensure we have found an opcode for both parities and that they are
8360 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8361 // inputs are undef.
8362 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8363 InVec0.isUndef() || InVec1.isUndef())
8364 return false;
8365
8366 IsSubAdd = Opc[0] == ISD::FADD;
8367
8368 Opnd0 = InVec0;
8369 Opnd1 = InVec1;
8370 return true;
8371}
8372
8373/// Returns true if is possible to fold MUL and an idiom that has already been
8374/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8375/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8376/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8377///
8378/// Prior to calling this function it should be known that there is some
8379/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8380/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8381/// before replacement of such SDNode with ADDSUB operation. Thus the number
8382/// of \p Opnd0 uses is expected to be equal to 2.
8383/// For example, this function may be called for the following IR:
8384/// %AB = fmul fast <2 x double> %A, %B
8385/// %Sub = fsub fast <2 x double> %AB, %C
8386/// %Add = fadd fast <2 x double> %AB, %C
8387/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8388/// <2 x i32> <i32 0, i32 3>
8389/// There is a def for %Addsub here, which potentially can be replaced by
8390/// X86ISD::ADDSUB operation:
8391/// %Addsub = X86ISD::ADDSUB %AB, %C
8392/// and such ADDSUB can further be replaced with FMADDSUB:
8393/// %Addsub = FMADDSUB %A, %B, %C.
8394///
8395/// The main reason why this method is called before the replacement of the
8396/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8397/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8398/// FMADDSUB is.
8399static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8400 SelectionDAG &DAG, SDValue &Opnd0,
8401 SDValue &Opnd1, SDValue &Opnd2,
8402 unsigned ExpectedUses,
8403 bool AllowSubAddOrAddSubContract) {
8404 if (Opnd0.getOpcode() != ISD::FMUL ||
8405 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8406 return false;
8407
8408 // FIXME: These checks must match the similar ones in
8409 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8410 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8411 // or MUL + ADDSUB to FMADDSUB.
8412 const TargetOptions &Options = DAG.getTarget().Options;
8413 bool AllowFusion =
8414 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8415 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8416 if (!AllowFusion)
8417 return false;
8418
8419 Opnd2 = Opnd1;
8420 Opnd1 = Opnd0.getOperand(1);
8421 Opnd0 = Opnd0.getOperand(0);
8422
8423 return true;
8424}
8425
8426/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8427/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8428/// X86ISD::FMSUBADD node.
///
/// Returns an empty SDValue when the build_vector does not match the idiom.
8430 const SDLoc &DL,
8431 const X86Subtarget &Subtarget,
8432 SelectionDAG &DAG) {
8433 SDValue Opnd0, Opnd1;
8434 unsigned NumExtracts;
8435 bool IsSubAdd;
8436 bool HasAllowContract;
8437 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8438 HasAllowContract))
8439 return SDValue();
8440
8441 MVT VT = BV->getSimpleValueType(0);
8442
8443 // Try to generate X86ISD::FMADDSUB node here.
8444 SDValue Opnd2;
8445 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8446 HasAllowContract)) {
8447 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8448 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8449 }
8450
8451 // We only support ADDSUB.
8452 if (IsSubAdd)
8453 return SDValue();
8454
8455 // There are no known X86 targets with 512-bit ADDSUB instructions!
8456 // Convert to blend(fsub,fadd).
8457 if (VT.is512BitVector()) {
8458 SmallVector<int> Mask;
// Even result lanes take the FSUB value (shuffle index I selects Sub's
// element I); odd lanes take the FADD value (index E + I + 1 selects Add's
// element I + 1, since Add's elements occupy indices [E, 2E)).
8459 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8460 Mask.push_back(I);
8461 Mask.push_back(I + E + 1);
8462 }
8463 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8464 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8465 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8466 }
8467
8468 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8469}
8470
// Matches a build_vector against the x86 horizontal-op element layout.
// On success, HOpcode receives the X86ISD::HADD/HSUB/FHADD/FHSUB opcode and
// V0/V1 the two source vectors (by reference).
8472 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8473 // Initialize outputs to known values.
8474 MVT VT = BV->getSimpleValueType(0);
8475 HOpcode = ISD::DELETED_NODE;
8476 V0 = DAG.getUNDEF(VT);
8477 V1 = DAG.getUNDEF(VT);
8478
8479 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8480 // half of the result is calculated independently from the 128-bit halves of
8481 // the inputs, so that makes the index-checking logic below more complicated.
8482 unsigned NumElts = VT.getVectorNumElements();
8483 unsigned GenericOpcode = ISD::DELETED_NODE;
8484 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8485 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8486 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8487 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8488 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8489 // Ignore undef elements.
8490 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8491 if (Op.isUndef())
8492 continue;
8493
8494 // If there's an opcode mismatch, we're done.
8495 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8496 return false;
8497
8498 // Initialize horizontal opcode.
// The first non-undef element picks the generic opcode; all subsequent
// elements must use the same one.
8499 if (HOpcode == ISD::DELETED_NODE) {
8500 GenericOpcode = Op.getOpcode();
8501 switch (GenericOpcode) {
8502 // clang-format off
8503 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8504 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8505 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8506 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8507 default: return false;
8508 // clang-format on
8509 }
8510 }
8511
// Both binop operands must be single-use constant-index extracts from the
// same source vector.
// NOTE(review): the matching opcode check on Op1 (EXTRACT_VECTOR_ELT) is
// part of this condition in the full source; this extract dropped a line.
8512 SDValue Op0 = Op.getOperand(0);
8513 SDValue Op1 = Op.getOperand(1);
8514 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8516 Op0.getOperand(0) != Op1.getOperand(0) ||
8517 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8518 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8519 return false;
8520
8521 // The source vector is chosen based on which 64-bit half of the
8522 // destination vector is being calculated.
8523 if (j < NumEltsIn64Bits) {
8524 if (V0.isUndef())
8525 V0 = Op0.getOperand(0);
8526 } else {
8527 if (V1.isUndef())
8528 V1 = Op0.getOperand(0);
8529 }
8530
8531 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8532 if (SourceVec != Op0.getOperand(0))
8533 return false;
8534
8535 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8536 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8537 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8538 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8539 (j % NumEltsIn64Bits) * 2;
8540 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8541 continue;
8542
8543 // If this is not a commutative op, this does not match.
8544 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8545 return false;
8546
8547 // Addition is commutative, so try swapping the extract indexes.
8548 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8549 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8550 continue;
8551
8552 // Extract indexes do not match horizontal requirement.
8553 return false;
8554 }
8555 }
8556 // We matched. Opcode and operands are returned by reference as arguments.
8557 return true;
8558}
8559
// Builds the actual horizontal-op node (HOpcode) for a matched build_vector,
// first resizing V0/V1 to the build_vector's width.
8561 const SDLoc &DL, SelectionDAG &DAG,
8562 unsigned HOpcode, SDValue V0, SDValue V1) {
8563 // If either input vector is not the same size as the build vector,
8564 // extract/insert the low bits to the correct size.
8565 // This is free (examples: zmm --> xmm, xmm --> ymm).
8566 MVT VT = BV->getSimpleValueType(0);
8567 unsigned Width = VT.getSizeInBits();
8568 if (V0.getValueSizeInBits() > Width)
8569 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8570 else if (V0.getValueSizeInBits() < Width)
8571 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8572
8573 if (V1.getValueSizeInBits() > Width)
8574 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8575 else if (V1.getValueSizeInBits() < Width)
8576 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8577
// Track which result elements are actually demanded (non-undef operands of
// the build_vector).
8578 unsigned NumElts = VT.getVectorNumElements();
8579 APInt DemandedElts = APInt::getAllOnes(NumElts);
8580 for (unsigned i = 0; i != NumElts; ++i)
8581 if (BV->getOperand(i).isUndef())
8582 DemandedElts.clearBit(i);
8583
8584 // If we don't need the upper xmm, then perform as a xmm hop.
8585 unsigned HalfNumElts = NumElts / 2;
8586 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8587 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8588 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8589 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8590 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8591 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8592 }
8593
8594 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8595}
8596
8597/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
///
/// Returns an empty SDValue when no profitable horizontal form is matched.
8599 const X86Subtarget &Subtarget,
8600 SelectionDAG &DAG) {
8601 // We need at least 2 non-undef elements to make this worthwhile by default.
8602 unsigned NumNonUndefs =
8603 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8604 if (NumNonUndefs < 2)
8605 return SDValue();
8606
8607 // There are 4 sets of horizontal math operations distinguished by type:
8608 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8609 // subtarget feature. Try to match those "native" patterns first.
8610 MVT VT = BV->getSimpleValueType(0);
8611 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8612 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8613 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8614 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8615 unsigned HOpcode;
8616 SDValue V0, V1;
8617 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8618 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8619 }
8620
8621 // Try harder to match 256-bit ops by using extract/concat.
8622 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8623 return SDValue();
8624
8625 // Count the number of UNDEF operands in the build_vector in input.
8626 unsigned NumElts = VT.getVectorNumElements();
8627 unsigned Half = NumElts / 2;
8628 unsigned NumUndefsLO = 0;
8629 unsigned NumUndefsHI = 0;
8630 for (unsigned i = 0, e = Half; i != e; ++i)
8631 if (BV->getOperand(i)->isUndef())
8632 NumUndefsLO++;
8633
8634 for (unsigned i = Half, e = NumElts; i != e; ++i)
8635 if (BV->getOperand(i)->isUndef())
8636 NumUndefsHI++;
8637
8638 SDValue InVec0, InVec1;
8639 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8640 SDValue InVec2, InVec3;
8641 unsigned X86Opcode;
8642 bool CanFold = true;
8643
// Match the low and high halves as independent partial h-ops; both halves
// must agree on their source vectors (or leave them undef).
8644 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8645 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8646 InVec3) &&
8647 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8648 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8649 X86Opcode = X86ISD::HADD;
8650 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8651 InVec1) &&
8652 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8653 InVec3) &&
8654 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8655 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8656 X86Opcode = X86ISD::HSUB;
8657 else
8658 CanFold = false;
8659
8660 if (CanFold) {
8661 // Do not try to expand this build_vector into a pair of horizontal
8662 // add/sub if we can emit a pair of scalar add/sub.
8663 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8664 return SDValue();
8665
8666 // Convert this build_vector into a pair of horizontal binops followed by
8667 // a concat vector. We must adjust the outputs from the partial horizontal
8668 // matching calls above to account for undefined vector halves.
8669 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8670 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8671 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8672 bool isUndefLO = NumUndefsLO == Half;
8673 bool isUndefHI = NumUndefsHI == Half;
8674 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8675 isUndefHI);
8676 }
8677 }
8678
8679 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8680 VT == MVT::v16i16) {
8681 unsigned X86Opcode;
8682 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8683 InVec1))
8684 X86Opcode = X86ISD::HADD;
8685 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8686 InVec1))
8687 X86Opcode = X86ISD::HSUB;
8688 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8689 InVec1))
8690 X86Opcode = X86ISD::FHADD;
8691 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8692 InVec1))
8693 X86Opcode = X86ISD::FHSUB;
8694 else
8695 return SDValue();
8696
8697 // Don't try to expand this build_vector into a pair of horizontal add/sub
8698 // if we can simply emit a pair of scalar add/sub.
8699 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8700 return SDValue();
8701
8702 // Convert this build_vector into two horizontal add/sub followed by
8703 // a concat vector.
8704 bool isUndefLO = NumUndefsLO == Half;
8705 bool isUndefHI = NumUndefsHI == Half;
8706 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8707 isUndefLO, isUndefHI);
8708 }
8709
8710 return SDValue();
8711}
8712
8713static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8714 SelectionDAG &DAG);
8715
8716/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8717/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8718/// just apply the bit to the vectors.
8719/// NOTE: Its not in our interest to start make a general purpose vectorizer
8720/// from this, but enough scalar bit operations are created from the later
8721/// legalization + scalarization stages to need basic support.
///
/// Returns an empty SDValue when the pattern does not apply.
8723 const X86Subtarget &Subtarget,
8724 SelectionDAG &DAG) {
8725 MVT VT = Op->getSimpleValueType(0);
8726 unsigned NumElems = VT.getVectorNumElements();
8727 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8728
8729 // Check that all elements have the same opcode.
8730 // TODO: Should we allow UNDEFS and if so how many?
8731 unsigned Opcode = Op->getOperand(0).getOpcode();
8732 for (unsigned i = 1; i < NumElems; ++i)
8733 if (Opcode != Op->getOperand(i).getOpcode())
8734 return SDValue();
8735
8736 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8737 bool IsShift = false;
8738 switch (Opcode) {
8739 default:
8740 return SDValue();
8741 case ISD::SHL:
8742 case ISD::SRL:
8743 case ISD::SRA:
8744 IsShift = true;
8745 break;
8746 case ISD::AND:
8747 case ISD::XOR:
8748 case ISD::OR:
8749 // Don't do this if the buildvector is a splat - we'd replace one
8750 // constant with an entire vector.
8751 if (Op->getSplatValue())
8752 return SDValue();
8753 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8754 return SDValue();
8755 break;
8756 }
8757
// Collect the per-element LHS values and constant RHS operands so the binop
// can be rebuilt as a single vector operation.
8758 SmallVector<SDValue, 4> LHSElts, RHSElts;
8759 for (SDValue Elt : Op->ops()) {
8760 SDValue LHS = Elt.getOperand(0);
8761 SDValue RHS = Elt.getOperand(1);
8762
8763 // We expect the canonicalized RHS operand to be the constant.
8764 if (!isa<ConstantSDNode>(RHS))
8765 return SDValue();
8766
8767 // Extend shift amounts.
8768 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8769 if (!IsShift)
8770 return SDValue();
8771 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8772 }
8773
8774 LHSElts.push_back(LHS);
8775 RHSElts.push_back(RHS);
8776 }
8777
8778 // Limit to shifts by uniform immediates.
8779 // TODO: Only accept vXi8/vXi64 special cases?
8780 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8781 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8782 return SDValue();
8783
8784 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8785 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8786 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8787
8788 if (!IsShift)
8789 return Res;
8790
8791 // Immediately lower the shift to ensure the constant build vector doesn't
8792 // get converted to a constant pool before the shift is lowered.
8793 return LowerShift(Res, Subtarget, DAG);
8794}
8795
8796static bool isShuffleFoldableLoad(SDValue);
8797
8798/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8799/// representing a blend.
///
/// Only handles v4f64 with exactly two distinct, non-constant, non-undef
/// operands; returns an empty SDValue otherwise.
8801 X86Subtarget const &Subtarget,
8802 SelectionDAG &DAG) {
8803 MVT VT = BVOp->getSimpleValueType(0u);
8804
8805 if (VT != MVT::v4f64)
8806 return SDValue();
8807
8808 // Collect unique operands.
8809 auto UniqueOps = SmallSet<SDValue, 16u>();
8810 for (SDValue Op : BVOp->ops()) {
8811 if (isIntOrFPConstant(Op) || Op.isUndef())
8812 return SDValue();
8813 UniqueOps.insert(Op);
8814 }
8815
8816 // Candidate BUILD_VECTOR must have 2 unique operands.
8817 if (UniqueOps.size() != 2u)
8818 return SDValue();
8819
8820 SDValue Op0 = BVOp->getOperand(0u);
8821 UniqueOps.erase(Op0);
8822 SDValue Op1 = *UniqueOps.begin();
8823
// Only profitable with AVX2 broadcasts, or when a splat can be folded from
// a load.
8824 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8825 isShuffleFoldableLoad(Op1)) {
8826 // Create shuffle mask.
// Lane I selects from the Op0 splat (index I) or the Op1 splat (I+NumElems).
8827 auto const NumElems = VT.getVectorNumElements();
8828 SmallVector<int, 16u> Mask(NumElems);
8829 for (auto I = 0u; I < NumElems; ++I) {
8830 SDValue Op = BVOp->getOperand(I);
8831 Mask[I] = Op == Op0 ? I : I + NumElems;
8832 }
8833 // Create shuffle of splats.
8834 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8835 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8836 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8837 }
8838
8839 return SDValue();
8840}
8841
8842/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8843/// functionality to do this, so it's all zeros, all ones, or some derivation
8844/// that is cheap to calculate.
///
/// Returns an empty SDValue if the constant cannot be materialized cheaply.
8846 SelectionDAG &DAG,
8847 const X86Subtarget &Subtarget) {
8848 MVT VT = Op.getSimpleValueType();
8849
8850 // Vectors containing all zeros can be matched by pxor and xorps.
8851 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8852 return Op;
8853
8854 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8855 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8856 // vpcmpeqd on 256-bit vectors.
8857 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
// vXi32 all-ones is already in the form pcmpeqd produces; other types get
// rebuilt via getOnesVector.
8858 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8859 return Op;
8860
8861 return getOnesVector(VT, DAG, DL);
8862 }
8863
8864 return SDValue();
8865}
8866
8867/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8868/// from a vector of source values and a vector of extraction indices.
8869/// The vectors might be manipulated to match the type of the permute op.
8870static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8871 const SDLoc &DL, SelectionDAG &DAG,
8872 const X86Subtarget &Subtarget) {
8873 MVT ShuffleVT = VT;
8874 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8875 unsigned NumElts = VT.getVectorNumElements();
8876 unsigned SizeInBits = VT.getSizeInBits();
8877
8878 // Adjust IndicesVec to match VT size.
8879 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8880 "Illegal variable permute mask size");
8881 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8882 // Narrow/widen the indices vector to the correct size.
8883 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8884 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8885 NumElts * VT.getScalarSizeInBits());
8886 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8887 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8888 SDLoc(IndicesVec), SizeInBits);
8889 // Zero-extend the index elements within the vector.
8890 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8891 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8892 IndicesVT, IndicesVec);
8893 }
8894 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8895
8896 // Handle SrcVec that don't match VT type.
8897 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8898 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8899 // Handle larger SrcVec by treating it as a larger permute.
8900 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8901 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8902 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8903 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8904 Subtarget, DAG, SDLoc(IndicesVec));
8905 SDValue NewSrcVec =
8906 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8907 if (NewSrcVec)
8908 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8909 return SDValue();
8910 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8911 // Widen smaller SrcVec to match VT.
8912 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8913 } else
8914 return SDValue();
8915 }
8916
8917 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8918 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8919 EVT SrcVT = Idx.getValueType();
8920 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8921 uint64_t IndexScale = 0;
8922 uint64_t IndexOffset = 0;
8923
8924 // If we're scaling a smaller permute op, then we need to repeat the
8925 // indices, scaling and offsetting them as well.
8926 // e.g. v4i32 -> v16i8 (Scale = 4)
8927 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8928 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8929 for (uint64_t i = 0; i != Scale; ++i) {
8930 IndexScale |= Scale << (i * NumDstBits);
8931 IndexOffset |= i << (i * NumDstBits);
8932 }
8933
8934 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8935 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8936 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8937 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8938 return Idx;
8939 };
8940
8941 unsigned Opcode = 0;
8942 switch (VT.SimpleTy) {
8943 default:
8944 break;
8945 case MVT::v16i8:
8946 if (Subtarget.hasSSSE3())
8947 Opcode = X86ISD::PSHUFB;
8948 break;
8949 case MVT::v8i16:
8950 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8951 Opcode = X86ISD::VPERMV;
8952 else if (Subtarget.hasSSSE3()) {
8953 Opcode = X86ISD::PSHUFB;
8954 ShuffleVT = MVT::v16i8;
8955 }
8956 break;
8957 case MVT::v4f32:
8958 case MVT::v4i32:
8959 if (Subtarget.hasAVX()) {
8960 Opcode = X86ISD::VPERMILPV;
8961 ShuffleVT = MVT::v4f32;
8962 } else if (Subtarget.hasSSSE3()) {
8963 Opcode = X86ISD::PSHUFB;
8964 ShuffleVT = MVT::v16i8;
8965 }
8966 break;
8967 case MVT::v2f64:
8968 case MVT::v2i64:
8969 if (Subtarget.hasAVX()) {
8970 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8971 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8972 Opcode = X86ISD::VPERMILPV;
8973 ShuffleVT = MVT::v2f64;
8974 } else if (Subtarget.hasSSE41()) {
8975 // SSE41 can compare v2i64 - select between indices 0 and 1.
8976 return DAG.getSelectCC(
8977 DL, IndicesVec,
8978 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8979 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8980 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8982 }
8983 break;
8984 case MVT::v32i8:
8985 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8986 Opcode = X86ISD::VPERMV;
8987 else if (Subtarget.hasXOP()) {
8988 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8989 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8990 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8991 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8992 return DAG.getNode(
8994 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8995 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8996 } else if (Subtarget.hasAVX()) {
8997 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8998 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8999 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9000 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9001 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9002 ArrayRef<SDValue> Ops) {
9003 // Permute Lo and Hi and then select based on index range.
9004 // This works as SHUFB uses bits[3:0] to permute elements and we don't
9005 // care about the bit[7] as its just an index vector.
9006 SDValue Idx = Ops[2];
9007 EVT VT = Idx.getValueType();
9008 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9009 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9010 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9012 };
9013 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9014 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9015 PSHUFBBuilder);
9016 }
9017 break;
9018 case MVT::v16i16:
9019 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9020 Opcode = X86ISD::VPERMV;
9021 else if (Subtarget.hasAVX()) {
9022 // Scale to v32i8 and perform as v32i8.
9023 IndicesVec = ScaleIndices(IndicesVec, 2);
9024 return DAG.getBitcast(
9026 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9027 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9028 }
9029 break;
9030 case MVT::v8f32:
9031 case MVT::v8i32:
9032 if (Subtarget.hasAVX2())
9033 Opcode = X86ISD::VPERMV;
9034 else if (Subtarget.hasAVX()) {
9035 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9036 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9037 {0, 1, 2, 3, 0, 1, 2, 3});
9038 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9039 {4, 5, 6, 7, 4, 5, 6, 7});
9040 if (Subtarget.hasXOP())
9041 return DAG.getBitcast(
9042 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9043 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9044 // Permute Lo and Hi and then select based on index range.
9045 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9046 SDValue Res = DAG.getSelectCC(
9047 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9048 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9049 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9051 return DAG.getBitcast(VT, Res);
9052 }
9053 break;
9054 case MVT::v4i64:
9055 case MVT::v4f64:
9056 if (Subtarget.hasAVX512()) {
9057 if (!Subtarget.hasVLX()) {
9058 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9059 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9060 SDLoc(SrcVec));
9061 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9062 DAG, SDLoc(IndicesVec));
9063 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9064 DAG, Subtarget);
9065 return extract256BitVector(Res, 0, DAG, DL);
9066 }
9067 Opcode = X86ISD::VPERMV;
9068 } else if (Subtarget.hasAVX()) {
9069 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9070 SDValue LoLo =
9071 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9072 SDValue HiHi =
9073 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9074 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9075 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9076 if (Subtarget.hasXOP())
9077 return DAG.getBitcast(
9078 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9079 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9080 // Permute Lo and Hi and then select based on index range.
9081 // This works as VPERMILPD only uses index bit[1] to permute elements.
9082 SDValue Res = DAG.getSelectCC(
9083 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9084 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9085 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9087 return DAG.getBitcast(VT, Res);
9088 }
9089 break;
9090 case MVT::v64i8:
9091 if (Subtarget.hasVBMI())
9092 Opcode = X86ISD::VPERMV;
9093 break;
9094 case MVT::v32i16:
9095 if (Subtarget.hasBWI())
9096 Opcode = X86ISD::VPERMV;
9097 break;
9098 case MVT::v16f32:
9099 case MVT::v16i32:
9100 case MVT::v8f64:
9101 case MVT::v8i64:
9102 if (Subtarget.hasAVX512())
9103 Opcode = X86ISD::VPERMV;
9104 break;
9105 }
9106 if (!Opcode)
9107 return SDValue();
9108
9109 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9110 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9111 "Illegal variable permute shuffle type");
9112
9113 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9114 if (Scale > 1)
9115 IndicesVec = ScaleIndices(IndicesVec, Scale);
9116
9117 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9118 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9119
9120 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9121 SDValue Res = Opcode == X86ISD::VPERMV
9122 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9123 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9124 return DAG.getBitcast(VT, Res);
9125}
9126
9127 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9128 // reasoned to be a permutation of a vector by indices in a non-constant vector.
9129 // (build_vector (extract_elt V, (extract_elt I, 0)),
9130 // (extract_elt V, (extract_elt I, 1)),
9131 // ...
9132 // ->
9133 // (vpermv I, V)
9134 //
9135 // TODO: Handle undefs
9136 // TODO: Utilize pshufb and zero mask blending to support more efficient
9137 // construction of vectors with constant-0 elements.
// NOTE(review): the scraped source is missing original line 9139 here (the
// line naming the function and its leading parameters). From the call site
// `LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget)` below, the
// signature presumably is (SDValue V, const SDLoc &DL, SelectionDAG &DAG,
// const X86Subtarget &Subtarget) — confirm against upstream.
9138 static SDValue
9140 SelectionDAG &DAG,
9141 const X86Subtarget &Subtarget) {
9142 SDValue SrcVec, IndicesVec;
9143
// Look through a single-use FREEZE wrapper so frozen build_vector operands
// can still match the extract-of-extract pattern below.
9144 auto PeekThroughFreeze = [](SDValue N) {
9145 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9146 return N->getOperand(0);
9147 return N;
9148 };
9149 // Check for a match of the permute source vector and permute index elements.
9150 // This is done by checking that the i-th build_vector operand is of the form:
9151 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9152 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9153 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
// Every operand must be an element extract; anything else defeats the match.
9154 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9155 return SDValue();
9156
9157 // If this is the first extract encountered in V, set the source vector,
9158 // otherwise verify the extract is from the previously defined source
9159 // vector.
9160 if (!SrcVec)
9161 SrcVec = Op.getOperand(0);
9162 else if (SrcVec != Op.getOperand(0))
9163 return SDValue();
9164 SDValue ExtractedIndex = Op->getOperand(1);
9165 // Peek through extends.
9166 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9167 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9168 ExtractedIndex = ExtractedIndex.getOperand(0);
// The per-element index itself must also come from an element extract.
9169 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9170 return SDValue();
9171
9172 // If this is the first extract from the index vector candidate, set the
9173 // indices vector, otherwise verify the extract is from the previously
9174 // defined indices vector.
9175 if (!IndicesVec)
9176 IndicesVec = ExtractedIndex.getOperand(0)
9177 else if (IndicesVec != ExtractedIndex.getOperand(0))
9178 return SDValue();
9179
// The i-th build_vector operand must read index slot i of IndicesVec, so
// the whole node is exactly "permute SrcVec by IndicesVec".
9180 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9181 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9182 return SDValue();
9183 }
9184
// Pattern matched: delegate to createVariablePermute to pick a target
// shuffle (PSHUFB/VPERMV/VPERMILPV/...) for this type/subtarget.
9185 MVT VT = V.getSimpleValueType();
9186 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9187 }
9188
// Custom lowering for ISD::BUILD_VECTOR. Classifies the operands
// (undef / freeze(undef) / zero / constant / variable) into bit masks, then
// walks a cascade of special cases from cheapest to most general:
// predicate (vXi1) and bf16 vectors, constant materialization, small
// build-vector + widen, addsub/horizontal/broadcast/bit-op/blend forms,
// single-variable-into-constants, single non-zero element, splats,
// variable permutes, consecutive loads, and finally generic unpckl
// expansion. Returns SDValue() where the generic legalizer should take over.
// NOTE(review): this scraped copy is missing several original lines (gaps in
// the embedded numbering are flagged inline below) — the code as shown is
// not compilable as-is; confirm each gap against upstream LLVM.
9189 SDValue
9190 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9191 SDLoc dl(Op);
9192
9193 MVT VT = Op.getSimpleValueType();
9194 MVT EltVT = VT.getVectorElementType();
9195 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9196 unsigned NumElems = Op.getNumOperands();
9197
9198 // Generate vectors for predicate vectors.
9199 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9200 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9201
9202 if (VT.getVectorElementType() == MVT::bf16 &&
9203 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9204 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9205
9206 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9207 return VectorCst;
9208
// Classify every operand into per-element masks; these drive all the
// special-case lowerings below.
9209 unsigned EVTBits = EltVT.getSizeInBits();
9210 APInt UndefMask = APInt::getZero(NumElems);
9211 APInt FrozenUndefMask = APInt::getZero(NumElems);
9212 APInt ZeroMask = APInt::getZero(NumElems);
9213 APInt NonZeroMask = APInt::getZero(NumElems);
9214 bool IsAllConstants = true;
9215 bool OneUseFrozenUndefs = true;
9216 SmallSet<SDValue, 8> Values;
9217 unsigned NumConstants = NumElems;
9218 for (unsigned i = 0; i < NumElems; ++i) {
9219 SDValue Elt = Op.getOperand(i);
9220 if (Elt.isUndef()) {
9221 UndefMask.setBit(i);
9222 continue;
9223 }
9224 if (ISD::isFreezeUndef(Elt.getNode())) {
9225 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9226 FrozenUndefMask.setBit(i);
9227 continue;
9228 }
// Values tracks distinct non-undef operands — size()==1 means a splat.
9229 Values.insert(Elt);
9230 if (!isIntOrFPConstant(Elt)) {
9231 IsAllConstants = false;
9232 NumConstants--;
9233 }
9234 if (X86::isZeroNode(Elt)) {
9235 ZeroMask.setBit(i);
9236 } else {
9237 NonZeroMask.setBit(i);
9238 }
9239 }
9240
9241 // All undef vector. Return an UNDEF.
9242 if (UndefMask.isAllOnes())
9243 return DAG.getUNDEF(VT);
9244
9245 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9246 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9247 return DAG.getFreeze(DAG.getUNDEF(VT));
9248
9249 // All undef/freeze(undef)/zero vector. Return a zero vector.
9250 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9251 return getZeroVector(VT, Subtarget, DAG, dl);
9252
9253 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9254 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9255 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9256 // and blend the FREEZE-UNDEF operands back in.
9257 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9258 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9259 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9260 SmallVector<int, 16> BlendMask(NumElems, -1);
9261 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9262 for (unsigned i = 0; i < NumElems; ++i) {
9263 if (UndefMask[i]) {
9264 BlendMask[i] = -1;
9265 continue;
9266 }
// Lane i selects from the thawed vector (i) or, for frozen-undef lanes,
// from the second shuffle operand (i + NumElems).
9267 BlendMask[i] = i;
9268 if (!FrozenUndefMask[i])
9269 Elts[i] = Op.getOperand(i);
9270 else
9271 BlendMask[i] += NumElems;
9272 }
9273 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9274 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9275 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9276 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9277 }
9278
9279 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9280
9281 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9282 // be better off lowering to a smaller build vector and padding with
9283 // undef/zero.
// NOTE(review): original line 9285 (the remainder of this if-condition) was
// dropped by the scraper — confirm the full condition against upstream.
9284 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9286 unsigned UpperElems = NumElems / 2;
9287 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9288 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9289 if (NumUpperUndefsOrZeros >= UpperElems) {
9290 if (VT.is512BitVector() &&
9291 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9292 UpperElems = NumElems - (NumElems / 4);
9293 // If freeze(undef) is in any upper elements, force to zero.
9294 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9295 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9296 SDValue NewBV =
9297 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9298 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9299 }
9300 }
9301
// Try the pattern-specific lowerings before falling back to element-wise
// construction.
9302 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9303 return AddSub;
9304 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9305 return HorizontalOp;
9306 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9307 return Broadcast;
9308 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9309 return BitOp;
9310 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9311 return Blend;
9312
9313 unsigned NumZero = ZeroMask.popcount();
9314 unsigned NumNonZero = NonZeroMask.popcount();
9315
9316 // If we are inserting one variable into a vector of non-zero constants, try
9317 // to avoid loading each constant element as a scalar. Load the constants as a
9318 // vector and then insert the variable scalar element. If insertion is not
9319 // supported, fall back to a shuffle to get the scalar blended with the
9320 // constants. Insertion into a zero vector is handled as a special-case
9321 // somewhere below here.
// NOTE(review): original lines 9324-9325 (the tail of this if-condition) were
// dropped by the scraper — confirm against upstream.
9322 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9323 FrozenUndefMask.isZero() &&
9326 // Create an all-constant vector. The variable element in the old
9327 // build vector is replaced by undef in the constant vector. Save the
9328 // variable scalar element and its index for use in the insertelement.
9329 LLVMContext &Context = *DAG.getContext();
9330 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9331 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9332 SDValue VarElt;
9333 SDValue InsIndex;
9334 for (unsigned i = 0; i != NumElems; ++i) {
9335 SDValue Elt = Op.getOperand(i);
9336 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9337 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9338 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9339 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9340 else if (!Elt.isUndef()) {
9341 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9342 "Expected one variable element in this vector");
9343 VarElt = Elt;
9344 InsIndex = DAG.getVectorIdxConstant(i, dl);
9345 }
9346 }
9347 Constant *CV = ConstantVector::get(ConstVecOps);
9348 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9349
9350 // The constants we just created may not be legal (eg, floating point). We
9351 // must lower the vector right here because we can not guarantee that we'll
9352 // legalize it before loading it. This is also why we could not just create
9353 // a new build vector here. If the build vector contains illegal constants,
9354 // it could get split back up into a series of insert elements.
9355 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9356 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
// NOTE(review): original lines 9357-9358 were dropped by the scraper;
// they presumably declare the MachinePointerInfo `MPI` used by the load
// below — confirm against upstream.
9359 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9360 unsigned InsertC = InsIndex->getAsZExtVal();
9361 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9362 if (InsertC < NumEltsInLow128Bits)
9363 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9364
9365 // There's no good way to insert into the high elements of a >128-bit
9366 // vector, so use shuffles to avoid an extract/insert sequence.
9367 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9368 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9369 SmallVector<int, 8> ShuffleMask;
9370 unsigned NumElts = VT.getVectorNumElements();
9371 for (unsigned i = 0; i != NumElts; ++i)
9372 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9373 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9374 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9375 }
9376
9377 // Special case for single non-zero, non-undef, element.
9378 if (NumNonZero == 1) {
9379 unsigned Idx = NonZeroMask.countr_zero();
9380 SDValue Item = Op.getOperand(Idx);
9381
9382 // If we have a constant or non-constant insertion into the low element of
9383 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9384 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9385 // depending on what the source datatype is.
9386 if (Idx == 0) {
9387 if (NumZero == 0)
9388 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9389
9390 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9391 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9392 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9393 assert((VT.is128BitVector() || VT.is256BitVector() ||
9394 VT.is512BitVector()) &&
9395 "Expected an SSE value type!");
9396 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9397 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9398 // zero vector.
9399 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9400 }
9401
9402 // We can't directly insert an i8 or i16 into a vector, so zero extend
9403 // it to i32 first.
9404 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9405 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9406 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9407 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9408 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9409 return DAG.getBitcast(VT, Item);
9410 }
9411 }
9412
9413 // Is it a vector logical left shift?
9414 if (NumElems == 2 && Idx == 1 &&
9415 X86::isZeroNode(Op.getOperand(0)) &&
9416 !X86::isZeroNode(Op.getOperand(1))) {
9417 unsigned NumBits = VT.getSizeInBits();
// NOTE(review): original line 9419 was dropped by the scraper; it
// presumably wraps Op.getOperand(1) in a SCALAR_TO_VECTOR node that this
// continuation line completes — confirm against upstream.
9418 return getVShift(true, VT,
9420 VT, Op.getOperand(1)),
9421 NumBits/2, DAG, *this, dl);
9422 }
9423
9424 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9425 return SDValue();
9426
9427 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9428 // is a non-constant being inserted into an element other than the low one,
9429 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9430 // movd/movss) to move this into the low element, then shuffle it into
9431 // place.
9432 if (EVTBits == 32) {
9433 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9434 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9435 }
9436 }
9437
9438 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9439 if (Values.size() == 1) {
9440 if (EVTBits == 32) {
9441 // Instead of a shuffle like this:
9442 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9443 // Check if it's possible to issue this instead.
9444 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9445 unsigned Idx = NonZeroMask.countr_zero();
9446 SDValue Item = Op.getOperand(Idx);
9447 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9448 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9449 }
9450 return SDValue();
9451 }
9452
9453 // A vector full of immediates; various special cases are already
9454 // handled, so this is best done with a single constant-pool load.
9455 if (IsAllConstants)
9456 return SDValue();
9457
// Try to match (extract_elt V, (extract_elt I, i)) chains as one variable
// permute node.
9458 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9459 return V;
9460
9461 // See if we can use a vector load to get all of the elements.
9462 {
9463 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9464 if (SDValue LD =
9465 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9466 return LD;
9467 }
9468
9469 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9470 // build_vector and broadcast it.
9471 // TODO: We could probably generalize this more.
9472 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9473 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9474 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9475 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9476 // Make sure all the even/odd operands match.
9477 for (unsigned i = 2; i != NumElems; ++i)
9478 if (Ops[i % 2] != Op.getOperand(i))
9479 return false;
9480 return true;
9481 };
9482 if (CanSplat(Op, NumElems, Ops)) {
9483 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9484 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9485 // Create a new build vector and cast to v2i64/v2f64.
9486 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9487 DAG.getBuildVector(NarrowVT, dl, Ops));
9488 // Broadcast from v2i64/v2f64 and cast to final VT.
9489 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9490 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9491 NewBV));
9492 }
9493 }
9494
9495 // For AVX-length vectors, build the individual 128-bit pieces and use
9496 // shuffles to put them in place.
9497 if (VT.getSizeInBits() > 128) {
9498 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9499
9500 // Build both the lower and upper subvector.
// NOTE(review): original line 9503 was dropped by the scraper; it
// presumably begins `SDValue Upper = DAG.getBuildVector(` which this
// continuation line completes (Upper is used below) — confirm upstream.
9501 SDValue Lower =
9502 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9504 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9505
9506 // Recreate the wider vector with the lower and upper part.
9507 return concatSubVectors(Lower, Upper, DAG, dl);
9508 }
9509
9510 // Let legalizer expand 2-wide build_vectors.
9511 if (EVTBits == 64) {
9512 if (NumNonZero == 1) {
9513 // One half is zero or undef.
9514 unsigned Idx = NonZeroMask.countr_zero();
9515 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9516 Op.getOperand(Idx));
9517 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9518 }
9519 return SDValue();
9520 }
9521
9522 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9523 if (EVTBits == 8 && NumElems == 16)
9524 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9525 NumZero, DAG, Subtarget))
9526 return V;
9527
9528 if (EltVT == MVT::i16 && NumElems == 8)
9529 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9530 NumZero, DAG, Subtarget))
9531 return V;
9532
9533 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9534 if (EVTBits == 32 && NumElems == 4)
9535 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9536 return V;
9537
9538 // If element VT is == 32 bits, turn it into a number of shuffles.
9539 if (NumElems == 4 && NumZero > 0) {
9540 SmallVector<SDValue, 8> Ops(NumElems);
9541 for (unsigned i = 0; i < 4; ++i) {
9542 bool isZero = !NonZeroMask[i];
9543 if (isZero)
9544 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9545 else
9546 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9547 }
9548
// Combine adjacent element pairs, dispatching on the 2-bit non-zero
// pattern of each pair.
9549 for (unsigned i = 0; i < 2; ++i) {
9550 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9551 default: llvm_unreachable("Unexpected NonZero count");
9552 case 0:
9553 Ops[i] = Ops[i*2]; // Must be a zero vector.
9554 break;
9555 case 1:
9556 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9557 break;
9558 case 2:
9559 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9560 break;
9561 case 3:
9562 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9563 break;
9564 }
9565 }
9566
// getMOVL in case 2 placed the elements swapped; the final shuffle mask
// compensates per half.
9567 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9568 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9569 int MaskVec[] = {
9570 Reverse1 ? 1 : 0,
9571 Reverse1 ? 0 : 1,
9572 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9573 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9574 };
9575 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9576 }
9577
9578 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9579
9580 // Check for a build vector from mostly shuffle plus few inserting.
9581 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9582 return Sh;
9583
9584 // For SSE 4.1, use insertps to put the high elements into the low element.
// NOTE(review): original line 9586 was dropped by the scraper; it
// presumably declares `SDValue Result;` used below — confirm upstream.
9585 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9587 if (!Op.getOperand(0).isUndef())
9588 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9589 else
9590 Result = DAG.getUNDEF(VT);
9591
9592 for (unsigned i = 1; i < NumElems; ++i) {
9593 if (Op.getOperand(i).isUndef()) continue;
9594 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9595 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9596 }
9597 return Result;
9598 }
9599
9600 // Otherwise, expand into a number of unpckl*, start by extending each of
9601 // our (non-undef) elements to the full vector width with the element in the
9602 // bottom slot of the vector (which generates no code for SSE).
9603 SmallVector<SDValue, 8> Ops(NumElems);
9604 for (unsigned i = 0; i < NumElems; ++i) {
9605 if (!Op.getOperand(i).isUndef())
9606 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9607 else
9608 Ops[i] = DAG.getUNDEF(VT);
9609 }
9610
9611 // Next, we iteratively mix elements, e.g. for v4f32:
9612 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9613 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9614 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9615 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9616 // Generate scaled UNPCKL shuffle mask.
// NOTE(review): original line 9617 was dropped by the scraper; it
// presumably declares the shuffle-mask SmallVector `Mask` — confirm
// against upstream.
9618 for(unsigned i = 0; i != Scale; ++i)
9619 Mask.push_back(i);
9620 for (unsigned i = 0; i != Scale; ++i)
9621 Mask.push_back(NumElems+i);
9622 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9623
9624 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9625 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9626 }
// After log2(NumElems) rounds everything has been folded into Ops[0].
9627 return Ops[0];
9628 }
9629
9630 // 256-bit AVX can use the vinsertf128 instruction
9631 // to create 256-bit vectors from two other 128-bit ones.
9632 // TODO: Detect subvector broadcast here instead of DAG combine?
// NOTE(review): original line 9633 (the function header) was dropped by the
// scraper; from the callers it presumably reads
// `static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,` —
// confirm against upstream. Lowers a 256/512-bit CONCAT_VECTORS by
// classifying each subvector (undef / freeze(undef) / zero / non-zero) and
// emitting insert_subvector chains on an appropriate base vector.
9634 SelectionDAG &DAG,
9635 const X86Subtarget &Subtarget) {
9636 MVT ResVT = Op.getSimpleValueType();
9637 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9638 "Value type must be 256-/512-bit wide");
9639
9640 unsigned NumOperands = Op.getNumOperands();
9641 unsigned NumFreezeUndef = 0;
9642 unsigned NumZero = 0;
9643 unsigned NumNonZero = 0;
9644 unsigned NonZeros = 0;
9645 SmallSet<SDValue, 4> Undefs;
9646 for (unsigned i = 0; i != NumOperands; ++i) {
9647 SDValue SubVec = Op.getOperand(i);
9648 if (SubVec.isUndef())
9649 continue;
9650 if (ISD::isFreezeUndef(SubVec.getNode())) {
9651 // If the freeze(undef) has multiple uses then we must fold to zero.
9652 if (SubVec.hasOneUse()) {
9653 ++NumFreezeUndef;
9654 } else {
// Remember multi-use freeze(undef)s so all their users can be
// rewritten to zero below (keeping every use consistent).
9655 ++NumZero;
9656 Undefs.insert(SubVec);
9657 }
9658 }
9659 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9660 ++NumZero;
9661 else {
9662 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9663 NonZeros |= 1 << i;
9664 ++NumNonZero;
9665 }
9666 }
9667
9668 // If we have more than 2 non-zeros, build each half separately.
9669 if (NumNonZero > 2) {
9670 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9671 ArrayRef<SDUse> Ops = Op->ops();
9672 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9673 Ops.slice(0, NumOperands/2));
9674 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9675 Ops.slice(NumOperands/2));
9676 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9677 }
9678
9679 // Otherwise, build it up through insert_subvectors.
// Base preference: zero if any lane must be zero, else frozen undef if any
// freeze(undef) lane exists, else plain undef.
9680 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9681 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9682 : DAG.getUNDEF(ResVT));
9683
9684 // Replace Undef operands with ZeroVector.
// NOTE(review): original line 9686 was dropped by the scraper; it
// presumably begins a DAG replace-all-uses call that this continuation
// line completes — confirm against upstream.
9685 for (SDValue U : Undefs)
9687 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9688
9689 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9690 unsigned NumSubElems = SubVT.getVectorNumElements();
9691 for (unsigned i = 0; i != NumOperands; ++i) {
// Only non-zero subvectors need an explicit insert; zero/undef lanes are
// already covered by the base vector chosen above.
9692 if ((NonZeros & (1 << i)) == 0)
9693 continue;
9694
9695 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9696 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9697 }
9698
9699 return Vec;
9700 }
9701
9702 // Returns true if the given node is a type promotion (by concatenating i1
9703 // zeros) of the result of a node that already zeros all upper bits of
9704 // k-register.
9705 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
// NOTE(review): the comment above looks stale — this function returns a
// lowered SDValue for a vXi1 (k-register) CONCAT_VECTORS, not a bool.
// NOTE(review): original line 9706 (the function header) was dropped by the
// scraper; from the caller it presumably reads
// `static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,` —
// confirm against upstream.
9707 const X86Subtarget &Subtarget,
9708 SelectionDAG & DAG) {
9709 MVT ResVT = Op.getSimpleValueType();
9710 unsigned NumOperands = Op.getNumOperands();
9711 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9712 "Unexpected number of operands in CONCAT_VECTORS");
9713
// Bitmask per operand slot: which subvectors are all-zero vs non-zero.
9714 uint64_t Zeros = 0;
9715 uint64_t NonZeros = 0;
9716 for (unsigned i = 0; i != NumOperands; ++i) {
9717 SDValue SubVec = Op.getOperand(i);
9718 if (SubVec.isUndef())
9719 continue;
9720 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9721 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9722 Zeros |= (uint64_t)1 << i;
9723 else
9724 NonZeros |= (uint64_t)1 << i;
9725 }
9726
9727 unsigned NumElems = ResVT.getVectorNumElements();
9728
9729 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9730 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9731 // insert_subvector will give us two kshifts.
9732 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9733 Log2_64(NonZeros) != NumOperands - 1) {
// Exactly one non-zero subvector: widen it to a legal mask type, shift it
// into position with one KSHIFTL, then extract the result type.
9734 unsigned Idx = Log2_64(NonZeros);
9735 SDValue SubVec = Op.getOperand(Idx);
9736 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9737 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9738 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9739 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9740 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9741 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9742 DAG.getVectorIdxConstant(0, dl));
9743 }
9744
9745 // If there are zero or one non-zeros we can handle this very simply.
9746 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9747 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9748 if (!NonZeros)
9749 return Vec;
9750 unsigned Idx = Log2_64(NonZeros);
9751 SDValue SubVec = Op.getOperand(Idx);
9752 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9753 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9754 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9755 }
9756
// More than one non-zero subvector: split in half and recurse via the
// generic CONCAT_VECTORS lowering until only the 2-operand case remains.
9757 if (NumOperands > 2) {
9758 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9759 ArrayRef<SDUse> Ops = Op->ops();
9760 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9761 Ops.slice(0, NumOperands / 2));
9762 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9763 Ops.slice(NumOperands / 2));
9764 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9765 }
9766
9767 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9768
9769 if (ResVT.getVectorNumElements() >= 16)
9770 return Op; // The operation is legal with KUNPCK
9771
// Two non-zero halves on a narrow mask type: build with two
// insert_subvectors into an undef base.
9772 SDValue Vec =
9773 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9774 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9775 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9776 DAG.getVectorIdxConstant(NumElems / 2, dl));
9777 }
9778
// Entry point for lowering ISD::CONCAT_VECTORS. NOTE(review): the signature
// line (presumably "static SDValue LowerCONCAT_VECTORS(SDValue Op,") was
// elided by the extraction tool.
9780 const X86Subtarget &Subtarget,
9781 SelectionDAG &DAG) {
9782 SDLoc DL(Op);
9783 MVT VT = Op.getSimpleValueType();
// i1 (k-mask) concatenation has its own dedicated path.
9784 if (VT.getVectorElementType() == MVT::i1)
9785 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9786
9787 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9788 // from two other 128-bit ones.
9789 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9790 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9791 (VT.is512BitVector() &&
9792 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9793 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9794}
9795
9796//===----------------------------------------------------------------------===//
9797// Vector shuffle lowering
9798//
9799// This is an experimental code path for lowering vector shuffles on x86. It is
9800// designed to handle arbitrary vector shuffles and blends, gracefully
9801// degrading performance as necessary. It works hard to recognize idiomatic
9802// shuffles and lower them to optimal instruction patterns without leaving
9803// a framework that allows reasonably efficient handling of all vector shuffle
9804// patterns.
9805//===----------------------------------------------------------------------===//
9806
9807/// Checks whether the vector elements referenced by two shuffle masks are
9808/// equivalent.
/// \param MaskSize number of elements in the shuffle mask (and both sources).
/// \param Op / ExpectedOp the two source vectors being compared.
/// \param Idx / ExpectedIdx the element of each source being compared.
/// \returns true if element Idx of Op is provably the same value as element
/// ExpectedIdx of ExpectedOp.
9809static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9810 int Idx, int ExpectedIdx) {
9811 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9812 ExpectedIdx < MaskSize && "Out of range element index");
// Both sources must exist and share an opcode before any lookthrough.
9813 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9814 return false;
9815
9816 EVT VT = Op.getValueType();
9817 EVT ExpectedVT = ExpectedOp.getValueType();
9818
9819 // Sources must be vectors and match the mask's element count.
9820 if (!VT.isVector() || !ExpectedVT.isVector() ||
9821 (int)VT.getVectorNumElements() != MaskSize ||
9822 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9823 return false;
9824
9825 // Exact match.
9826 if (Idx == ExpectedIdx && Op == ExpectedOp)
9827 return true;
9828
9829 switch (Op.getOpcode()) {
9830 case ISD::BUILD_VECTOR:
9831 // If the values are build vectors, we can look through them to find
9832 // equivalent inputs that make the shuffles equivalent.
9833 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9834 case ISD::BITCAST: {
// NOTE(review): a line defining Src (presumably "SDValue Src =
// Op.getOperand(0);") was elided by the extraction tool here.
9836 EVT SrcVT = Src.getValueType();
9837 if (Op == ExpectedOp && SrcVT.isVector()) {
// Widening bitcast: both narrow elements must come from the same
// sub-position of equivalent wide source elements.
9838 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9839 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9840 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9841 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9842 Idx / Scale, ExpectedIdx / Scale);
9843 }
// Narrowing bitcast: every narrow source element covered by each wide
// element must be pairwise equivalent.
9844 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9845 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9846 for (unsigned I = 0; I != Scale; ++I)
9847 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9848 (Idx * Scale) + I,
9849 (ExpectedIdx * Scale) + I))
9850 return false;
9851 return true;
9852 }
9853 }
9854 break;
9855 }
9856 case ISD::VECTOR_SHUFFLE: {
// Same shuffle node: equivalent iff both mask slots reference the same
// source element.
9857 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9858 return Op == ExpectedOp &&
9859 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9860 }
9861 case X86ISD::VBROADCAST:
// NOTE(review): an additional case label (line 9862, presumably
// X86ISD::VBROADCAST_LOAD) was elided by the extraction tool.
// A broadcast repeats one element, so any two lanes match.
9863 return Op == ExpectedOp;
// NOTE(review): the case label for the following body (line 9864, a
// broadcast-load style memory opcode) was elided by the extraction tool.
9865 if (Op == ExpectedOp) {
// The loaded subvector repeats; indices are equivalent modulo its width.
9866 auto *MemOp = cast<MemSDNode>(Op);
9867 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9868 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9869 }
9870 break;
9871 case X86ISD::VPERMI: {
9872 if (Op == ExpectedOp) {
// NOTE(review): the declaration of Mask (line 9873, presumably a
// SmallVector<int, ...>) was elided by the extraction tool.
9874 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9875 SDValue Src = Op.getOperand(0);
// Compare the permuted source elements instead of the permute results.
9876 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9877 Mask[ExpectedIdx]);
9878 }
9879 break;
9880 }
9881 case X86ISD::HADD:
9882 case X86ISD::HSUB:
9883 case X86ISD::FHADD:
9884 case X86ISD::FHSUB:
9885 case X86ISD::PACKSS:
9886 case X86ISD::PACKUS:
9887 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9888 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9889 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9890 int NumElts = VT.getVectorNumElements();
9891 int NumLanes = VT.getSizeInBits() / 128;
9892 int NumEltsPerLane = NumElts / NumLanes;
9893 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9894 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9895 bool SameElt =
9896 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9897 return SameLane && SameElt;
9898 }
9899 break;
9900 }
9901
9902 return false;
9903}
9904
9905/// Tiny helper function to identify a no-op mask.
9906///
9907/// This is a somewhat boring predicate function. It checks whether the mask
9908/// array input, which is assumed to be a single-input shuffle mask of the kind
9909/// used by the X86 shuffle instructions (not a fully general
9910/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9911/// in-place shuffle are 'no-op's.
// NOTE(review): the signature line (presumably "static bool
// isNoopShuffleMask(ArrayRef<int> Mask) {") was elided by the extraction tool.
9913 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9914 assert(Mask[i] >= -1 && "Out of bound mask element!");
// Any defined element that does not stay in place makes it a real shuffle.
9915 if (Mask[i] >= 0 && Mask[i] != i)
9916 return false;
9917 }
9918 return true;
9919}
9920
9921/// Test whether there are elements crossing LaneSizeInBits lanes in this
9922/// shuffle mask.
9923///
9924/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9925/// and we routinely test for these.
9926static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9927 unsigned ScalarSizeInBits,
9928 ArrayRef<int> Mask) {
9929 assert(LaneSizeInBits && ScalarSizeInBits &&
9930 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9931 "Illegal shuffle lane size");
9932 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9933 int Size = Mask.size();
9934 for (int i = 0; i < Size; ++i)
9935 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9936 return true;
9937 return false;
9938}
9939
9940/// Test whether there are elements crossing 128-bit lanes in this
9941/// shuffle mask.
// NOTE(review): the signature line (presumably "static bool
// is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {") was elided
// by the extraction tool.
9943 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9944}
9945
9946/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9947/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9948/// better support 'repeated mask + lane permute' style shuffles.
9949static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9950 unsigned ScalarSizeInBits,
9951 ArrayRef<int> Mask) {
9952 assert(LaneSizeInBits && ScalarSizeInBits &&
9953 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9954 "Illegal shuffle lane size");
9955 int NumElts = Mask.size();
9956 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9957 int NumLanes = NumElts / NumEltsPerLane;
9958 if (NumLanes > 1) {
9959 for (int i = 0; i != NumLanes; ++i) {
9960 int SrcLane = -1;
9961 for (int j = 0; j != NumEltsPerLane; ++j) {
9962 int M = Mask[(i * NumEltsPerLane) + j];
9963 if (M < 0)
9964 continue;
9965 int Lane = (M % NumElts) / NumEltsPerLane;
9966 if (SrcLane >= 0 && SrcLane != Lane)
9967 return true;
9968 SrcLane = Lane;
9969 }
9970 }
9971 }
9972 return false;
9973}
9974
9975/// Test whether a shuffle mask is equivalent within each sub-lane.
9976///
9977/// This checks a shuffle mask to see if it is performing the same
9978/// lane-relative shuffle in each sub-lane. This trivially implies
9979/// that it is also not lane-crossing. It may however involve a blend from the
9980/// same lane of a second vector.
9981///
9982/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9983/// non-trivial to compute in the face of undef lanes. The representation is
9984/// suitable for use with existing 128-bit shuffles as entries from the second
9985/// vector have been remapped to [LaneSize, 2*LaneSize).
9986static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9987 ArrayRef<int> Mask,
9988 SmallVectorImpl<int> &RepeatedMask) {
9989 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9990 RepeatedMask.assign(LaneSize, -1);
9991 int Size = Mask.size();
9992 for (int i = 0; i < Size; ++i) {
9993 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9994 if (Mask[i] < 0)
9995 continue;
9996 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9997 // This entry crosses lanes, so there is no way to model this shuffle.
9998 return false;
9999
10000 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10001 // Adjust second vector indices to start at LaneSize instead of Size.
10002 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10003 : Mask[i] % LaneSize + LaneSize;
10004 if (RepeatedMask[i % LaneSize] < 0)
10005 // This is the first non-undef entry in this slot of a 128-bit lane.
10006 RepeatedMask[i % LaneSize] = LocalM;
10007 else if (RepeatedMask[i % LaneSize] != LocalM)
10008 // Found a mismatch with the repeated mask.
10009 return false;
10010 }
10011 return true;
10012}
10013
10014/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10015static bool
// NOTE(review): the name/first-parameter line (presumably
// "is128BitRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,") was elided by
// the extraction tool.
10017 SmallVectorImpl<int> &RepeatedMask) {
10018 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10019}
10020
// Convenience overload that discards the computed repeated mask.
10021static bool
// NOTE(review): the name/parameter line (presumably
// "is128BitRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {") was elided by
// the extraction tool.
10023 SmallVector<int, 32> RepeatedMask;
10024 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10025}
10026
10027/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10028static bool
// NOTE(review): the name/first-parameter line (presumably
// "is256BitRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,") was elided by
// the extraction tool.
10030 SmallVectorImpl<int> &RepeatedMask) {
10031 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10032}
10033
10034/// Test whether a target shuffle mask is equivalent within each sub-lane.
10035/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10036static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10037 unsigned EltSizeInBits,
10038 ArrayRef<int> Mask,
10039 SmallVectorImpl<int> &RepeatedMask) {
10040 int LaneSize = LaneSizeInBits / EltSizeInBits;
10041 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10042 int Size = Mask.size();
10043 for (int i = 0; i < Size; ++i) {
10044 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10045 if (Mask[i] == SM_SentinelUndef)
10046 continue;
10047 if (Mask[i] == SM_SentinelZero) {
10048 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10049 return false;
10050 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10051 continue;
10052 }
10053 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10054 // This entry crosses lanes, so there is no way to model this shuffle.
10055 return false;
10056
10057 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10058 // later vector indices to start at multiples of LaneSize instead of Size.
10059 int LaneM = Mask[i] / Size;
10060 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10061 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10062 // This is the first non-undef entry in this slot of a 128-bit lane.
10063 RepeatedMask[i % LaneSize] = LocalM;
10064 else if (RepeatedMask[i % LaneSize] != LocalM)
10065 // Found a mismatch with the repeated mask.
10066 return false;
10067 }
10068 return true;
10069}
10070
10071/// Test whether a target shuffle mask is equivalent within each sub-lane.
10072/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10073static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10074 ArrayRef<int> Mask,
10075 SmallVectorImpl<int> &RepeatedMask) {
10076 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10077 Mask, RepeatedMask);
10078}
10079
10080/// Checks whether a shuffle mask is equivalent to an explicit list of
10081/// arguments.
10082///
10083/// This is a fast way to test a shuffle mask against a fixed pattern:
10084///
10085/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
10086///
10087/// It returns true if the mask is exactly as wide as the argument list, and
10088/// each element of the mask is either -1 (signifying undef) or the value given
10089/// in the argument.
10090static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10091 SDValue V1 = SDValue(),
10092 SDValue V2 = SDValue()) {
10093 int Size = Mask.size();
10094 if (Size != (int)ExpectedMask.size())
10095 return false;
10096
10097 for (int i = 0; i < Size; ++i) {
10098 assert(Mask[i] >= -1 && "Out of bound mask element!");
10099 int MaskIdx = Mask[i];
10100 int ExpectedIdx = ExpectedMask[i];
10101 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10102 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10103 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10104 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10105 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10106 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10107 return false;
10108 }
10109 }
10110 return true;
10111}
10112
10113/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10114///
10115/// The masks must be exactly the same width.
10116///
10117/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10118/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10119///
10120/// SM_SentinelZero is accepted as a valid negative index but must match in
10121/// both, or via a known bits test.
// NOTE(review): the signature line (presumably "static bool
// isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,") was elided by the
// extraction tool.
10123 ArrayRef<int> ExpectedMask,
10124 const SelectionDAG &DAG,
10125 SDValue V1 = SDValue(),
10126 SDValue V2 = SDValue()) {
10127 int Size = Mask.size();
10128 if (Size != (int)ExpectedMask.size())
10129 return false;
10130 assert(llvm::all_of(ExpectedMask,
10131 [Size](int M) {
10132 return M == SM_SentinelZero ||
10133 isInRange(M, 0, 2 * Size);
10134 }) &&
10135 "Illegal target shuffle mask");
10136
10137 // Check for out-of-range target shuffle mask indices.
10138 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10139 return false;
10140
10141 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10142 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10143 !V1.getValueType().isVector()))
10144 V1 = SDValue();
10145 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10146 !V2.getValueType().isVector()))
10147 V2 = SDValue();
10148
// Per-operand bitmasks of elements that must be proven zero; checked in one
// batch at the end to avoid repeated known-bits queries.
10149 APInt ZeroV1 = APInt::getZero(Size);
10150 APInt ZeroV2 = APInt::getZero(Size);
10151
10152 for (int i = 0; i < Size; ++i) {
10153 int MaskIdx = Mask[i];
10154 int ExpectedIdx = ExpectedMask[i];
10155 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10156 continue;
10157 // If we failed to match an expected SM_SentinelZero then early out.
10158 if (ExpectedIdx < 0)
10159 return false;
10160 if (MaskIdx == SM_SentinelZero) {
10161 // If we need this expected index to be a zero element, then update the
10162 // relevant zero mask and perform the known bits at the end to minimize
10163 // repeated computes.
10164 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10165 if (ExpectedV &&
10166 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10167 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10168 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10169 ZeroMask.setBit(BitIdx);
10170 continue;
10171 }
10172 }
// Mismatched positive indices may still reference equivalent values.
10173 if (MaskIdx >= 0) {
10174 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10175 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10176 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10177 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10178 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10179 continue;
10180 }
10181 return false;
10182 }
// Batch known-bits verification of all deferred zero-element requirements.
10183 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10184 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10185}
10186
10187// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10188// instructions.
// NOTE(review): the signature line (presumably "static bool
// isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,") was elided by the
// extraction tool.
10190 const SelectionDAG &DAG) {
// Only the 8 x 32-bit types are candidates for a word-unpack interpretation.
10191 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10192 return false;
10193
// Build the canonical binary lo/hi word-unpack masks and compare.
10194 SmallVector<int, 8> Unpcklwd;
10195 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10196 /* Unary = */ false);
10197 SmallVector<int, 8> Unpckhwd;
10198 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10199 /* Unary = */ false);
10200 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10201 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10202 return IsUnpackwdMask;
10203}
10204
// Tests whether a mask matches any 128-bit unpack pattern (unary or binary,
// lo or hi), in either operand order. NOTE(review): the signature line
// (presumably "static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,")
// was elided by the extraction tool.
10206 const SelectionDAG &DAG) {
10207 // Create 128-bit vector type based on mask size.
10208 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10209 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10210
10211 // We can't assume a canonical shuffle mask, so try the commuted version too.
10212 SmallVector<int, 4> CommutedMask(Mask);
// NOTE(review): the statement commuting CommutedMask (line 10213, presumably
// "ShuffleVectorSDNode::commuteMask(CommutedMask);") was elided by the
// extraction tool.
10214
10215 // Match any of unary/binary or low/high.
10216 for (unsigned i = 0; i != 4; ++i) {
10217 SmallVector<int, 16> UnpackMask;
10218 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10219 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10220 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10221 return true;
10222 }
10223 return false;
10224}
10225
10226/// Return true if a shuffle mask chooses elements identically in its top and
10227/// bottom halves. For example, any splat mask has the same top and bottom
10228/// halves. If an element is undefined in only one half of the mask, the halves
10229/// are not considered identical.
// NOTE(review): the signature line (presumably "static bool
// hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {") was elided by the
// extraction tool.
10231 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10232 unsigned HalfSize = Mask.size() / 2;
// Compare each element against its counterpart HalfSize positions later;
// undef (-1) only matches undef, per the doc comment above.
10233 for (unsigned i = 0; i != HalfSize; ++i) {
10234 if (Mask[i] != Mask[i + HalfSize])
10235 return false;
10236 }
10237 return true;
10238}
10239
10240/// Get a 4-lane 8-bit shuffle immediate for a mask.
10241///
10242/// This helper function produces an 8-bit shuffle immediate corresponding to
10243/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10244/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10245/// example.
10246///
10247/// NB: We rely heavily on "undef" masks preserving the input lane.
10248static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10249 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10250 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10251 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10252 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10253 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10254
10255 // If the mask only uses one non-undef element, then fully 'splat' it to
10256 // improve later broadcast matching.
10257 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10258 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10259
10260 int FirstElt = Mask[FirstIndex];
10261 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10262 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10263
10264 unsigned Imm = 0;
10265 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10266 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10267 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10268 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10269 return Imm;
10270}
10271
// Wraps getV4X86ShuffleImm's result in an i8 target constant for use as an
// instruction operand. NOTE(review): the signature line (presumably "static
// SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,")
// was elided by the extraction tool.
10273 SelectionDAG &DAG) {
10274 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10275}
10276
10277// Canonicalize SHUFPD mask to improve chances of further folding.
10278// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10279static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10280 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10281 "Unexpected SHUFPD mask size");
10282 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10283 "Unexpected SHUFPD mask elements");
10284
10285 // If the mask only uses one non-undef element, then fully 'splat' it to
10286 // improve later broadcast matching.
10287 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10288 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10289 "All undef shuffle mask");
10290
10291 int FirstElt = Mask[FirstIndex];
10292 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10293 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10294 unsigned Imm = 0;
10295 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10296 Imm |= FirstElt << I;
10297 return Imm;
10298 }
10299
10300 // Attempt to keep any undef elements in place to improve chances of the
10301 // shuffle becoming a (commutative) blend.
10302 unsigned Imm = 0;
10303 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10304 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10305
10306 return Imm;
10307}
10308
// Wraps getSHUFPDImm's result in an i8 target constant for use as an
// instruction operand. NOTE(review): the signature line (presumably "static
// SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,") was
// elided by the extraction tool.
10310 SelectionDAG &DAG) {
10311 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10312}
10313
10314// The Shuffle result is as follow:
10315// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
10316// Each Zeroable's element correspond to a particular Mask's element.
10317// As described in computeZeroableShuffleElements function.
10318//
10319// The function looks for a sub-mask that the nonzero elements are in
10320// increasing order. If such sub-mask exist. The function returns true.
10321static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10322 ArrayRef<int> Mask, const EVT &VectorType,
10323 bool &IsZeroSideLeft) {
10324 int NextElement = -1;
10325 // Check if the Mask's nonzero elements are in increasing order.
10326 for (int i = 0, e = Mask.size(); i < e; i++) {
10327 // Checks if the mask's zeros elements are built from only zeros.
10328 assert(Mask[i] >= -1 && "Out of bound mask element!");
10329 if (Mask[i] < 0)
10330 return false;
10331 if (Zeroable[i])
10332 continue;
10333 // Find the lowest non zero element
10334 if (NextElement < 0) {
10335 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10336 IsZeroSideLeft = NextElement != 0;
10337 }
10338 // Exit if the mask's non zero elements are not in increasing order.
10339 if (NextElement != Mask[i])
10340 return false;
10341 NextElement++;
10342 }
10343 return true;
10344}
10345
// Forward declaration: defined later in this file, referenced by the shuffle
// lowering helpers below.
10346static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10348 const X86Subtarget &Subtarget,
10349 unsigned Depth = 0);
10350
10351/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
// Returns SDValue() on failure (mask needs both inputs, or crosses a 128-bit
// lane). NOTE(review): the signature line (presumably "static SDValue
// lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,") was elided by the
// extraction tool.
10353 ArrayRef<int> Mask, SDValue V1,
10354 SDValue V2, const APInt &Zeroable,
10355 const X86Subtarget &Subtarget,
10356 SelectionDAG &DAG) {
10357 int Size = Mask.size();
10358 int LaneSize = 128 / VT.getScalarSizeInBits();
10359 const int NumBytes = VT.getSizeInBits() / 8;
10360 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10361
// PSHUFB availability: SSSE3 at 128-bit, AVX2 at 256-bit, BWI at 512-bit.
10362 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10363 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10364 (Subtarget.hasBWI() && VT.is512BitVector()));
10365
10366 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10367 // Sign bit set in i8 mask means zero element.
10368 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10369
// Build the byte-level control vector; V tracks the single source in use.
10370 SDValue V;
10371 for (int i = 0; i < NumBytes; ++i) {
10372 int M = Mask[i / NumEltBytes];
10373 if (M < 0) {
10374 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10375 continue;
10376 }
10377 if (Zeroable[i / NumEltBytes]) {
10378 PSHUFBMask[i] = ZeroMask;
10379 continue;
10380 }
10381
10382 // We can only use a single input of V1 or V2.
10383 SDValue SrcV = (M >= Size ? V2 : V1);
10384 if (V && V != SrcV)
10385 return SDValue();
10386 V = SrcV;
10387 M %= Size;
10388
10389 // PSHUFB can't cross lanes, ensure this doesn't happen.
10390 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10391 return SDValue();
10392
// Convert the element index into an in-lane byte index.
10393 M = M % LaneSize;
10394 M = M * NumEltBytes + (i % NumEltBytes);
10395 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10396 }
10397 assert(V && "Failed to find a source input");
10398
10399 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10400 return DAG.getBitcast(
10401 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10402 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10403}
10404
// Forward declaration: defined later in this file, used by
// lowerShuffleToEXPAND below to materialize the k-mask operand.
10405static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10406 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10407 const SDLoc &dl);
10408
10409// X86 has dedicated shuffle that can be lowered to VEXPAND
// NOTE(review): the signature line (presumably "static SDValue
// lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,") was elided by
// the extraction tool.
10411 SDValue V2, ArrayRef<int> Mask,
10412 const APInt &Zeroable,
10413 const X86Subtarget &Subtarget,
10414 SelectionDAG &DAG) {
// Only applicable when the non-zero elements appear in increasing order with
// zeros interleaved; IsLeftZeroSide records which input is expanded.
10415 bool IsLeftZeroSide = true;
10416 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10417 IsLeftZeroSide))
10418 return SDValue();
// The expand mask selects the non-zeroable positions.
10419 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
// NOTE(review): the declaration line (10420, presumably "MVT IntegerType =")
// was elided by the extraction tool.
10421 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10422 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10423 unsigned NumElts = VT.getVectorNumElements();
10424 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10425 "Unexpected number of vector elements");
10426 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10427 Subtarget, DAG, DL);
10428 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10429 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10430 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10431}
10432
// Attempts to match TargetMask as an UNPCKL/UNPCKH pattern, possibly against
// zero or with commuted operands. On success sets UnpackOpcode and may
// rewrite V1/V2 (undef substitution, zero vector, or swap); returns true.
10433static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10434 unsigned &UnpackOpcode, bool IsUnary,
10435 ArrayRef<int> TargetMask, const SDLoc &DL,
10436 SelectionDAG &DAG,
10437 const X86Subtarget &Subtarget) {
10438 int NumElts = VT.getVectorNumElements();
10439
// Determine whether all even (1) / odd (2) mask positions are undef or zero.
10440 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10441 for (int i = 0; i != NumElts; i += 2) {
10442 int M1 = TargetMask[i + 0];
10443 int M2 = TargetMask[i + 1];
10444 Undef1 &= (SM_SentinelUndef == M1);
10445 Undef2 &= (SM_SentinelUndef == M2);
10446 Zero1 &= isUndefOrZero(M1);
10447 Zero2 &= isUndefOrZero(M2);
10448 }
10449 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10450 "Zeroable shuffle detected");
10451
10452 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10453 SmallVector<int, 64> Unpckl, Unpckh;
10454 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10455 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10456 (IsUnary ? V1 : V2))) {
10457 UnpackOpcode = X86ISD::UNPCKL;
10458 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10459 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10460 return true;
10461 }
10462
10463 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10464 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10465 (IsUnary ? V1 : V2))) {
10466 UnpackOpcode = X86ISD::UNPCKH;
10467 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10468 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10469 return true;
10470 }
10471
10472 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10473 if (IsUnary && (Zero1 || Zero2)) {
10474 // Don't bother if we can blend instead.
10475 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10476 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10477 return false;
10478
10479 bool MatchLo = true, MatchHi = true;
10480 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10481 int M = TargetMask[i];
10482
10483 // Ignore if the input is known to be zero or the index is undef.
10484 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10485 (M == SM_SentinelUndef))
10486 continue;
10487
10488 MatchLo &= (M == Unpckl[i]);
10489 MatchHi &= (M == Unpckh[i]);
10490 }
10491
10492 if (MatchLo || MatchHi) {
10493 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10494 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10495 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10496 return true;
10497 }
10498 }
10499
10500 // If a binary shuffle, commute and try again.
10501 if (!IsUnary) {
// NOTE(review): the statement commuting Unpckl (line 10502, presumably
// "ShuffleVectorSDNode::commuteMask(Unpckl);") was elided by the extraction
// tool.
10503 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10504 UnpackOpcode = X86ISD::UNPCKL;
10505 std::swap(V1, V2);
10506 return true;
10507 }
10508
// NOTE(review): the statement commuting Unpckh (line 10509) was elided by
// the extraction tool.
10510 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10511 UnpackOpcode = X86ISD::UNPCKH;
10512 std::swap(V1, V2);
10513 return true;
10514 }
10515 }
10516
10517 return false;
10518}
10519
10520// X86 has dedicated unpack instructions that can handle specific blend
10521// operations: UNPCKH and UNPCKL.
// Returns SDValue() if the mask matches neither unpack pattern in either
// operand order. NOTE(review): the signature line (presumably "static SDValue
// lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,") was elided by
// the extraction tool.
10523 SDValue V2, ArrayRef<int> Mask,
10524 SelectionDAG &DAG) {
10525 SmallVector<int, 8> Unpckl;
10526 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10527 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10528 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10529
10530 SmallVector<int, 8> Unpckh;
10531 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10532 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10533 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10534
10535 // Commute and try again.
// NOTE(review): the statement commuting Unpckl (line 10536, presumably
// "ShuffleVectorSDNode::commuteMask(Unpckl);") was elided by the extraction
// tool; note the swapped V2/V1 operands below.
10537 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10539
// NOTE(review): the statement commuting Unpckh (line 10540) was elided by
// the extraction tool.
10541 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10542 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10543
10544 return SDValue();
10545}
10546
10547/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10548/// followed by unpack 256-bit.
// NOTE(review): the signature line (presumably "static SDValue
// lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,") was elided
// by the extraction tool. V2 is only consulted for mask matching; the result
// unpacks the rearranged V1 with itself.
10550 SDValue V2, ArrayRef<int> Mask,
10551 SelectionDAG &DAG) {
10552 SmallVector<int, 32> Unpckl, Unpckh;
10553 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10554 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10555
10556 unsigned UnpackOpcode;
10557 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10558 UnpackOpcode = X86ISD::UNPCKL;
10559 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10560 UnpackOpcode = X86ISD::UNPCKH;
10561 else
10562 return SDValue();
10563
10564 // This is a "natural" unpack operation (rather than the 128-bit sectored
10565 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10566 // input in order to use the x86 instruction.
10567 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10568 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10569 V1 = DAG.getBitcast(VT, V1);
10570 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10571}
10572
10573// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10574// source into the lower elements and zeroing the upper elements.
10575static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10576 ArrayRef<int> Mask, const APInt &Zeroable,
10577 const X86Subtarget &Subtarget) {
10578 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10579 return false;
10580
10581 unsigned NumElts = Mask.size();
10582 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10583 unsigned MaxScale = 64 / EltSizeInBits;
10584
10585 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10586 unsigned SrcEltBits = EltSizeInBits * Scale;
10587 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10588 continue;
10589 unsigned NumSrcElts = NumElts / Scale;
10590 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10591 continue;
10592 unsigned UpperElts = NumElts - NumSrcElts;
10593 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10594 continue;
10595 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10596 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10597 DstVT = MVT::getIntegerVT(EltSizeInBits);
10598 if ((NumSrcElts * EltSizeInBits) >= 128) {
10599 // ISD::TRUNCATE
10600 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10601 } else {
10602 // X86ISD::VTRUNC
10603 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10604 }
10605 return true;
10606 }
10607
10608 return false;
10609}
10610
// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
// element padding to the final DstVT.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG, bool ZeroUppers) {
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstSVT = DstVT.getScalarType();
  unsigned NumDstElts = DstVT.getVectorNumElements();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();
  unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();

  // Bail if the source type isn't legal for this target.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
    return SDValue();

  // Perform a direct ISD::TRUNCATE if possible.
  if (NumSrcElts == NumDstElts)
    return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);

  // More source elements than destination elements: truncate them all and
  // keep only the lowest subvector of the result.
  if (NumSrcElts > NumDstElts) {
    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
    return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
  }

  // Fewer source elements, but the truncated result is still at least
  // 128 bits: truncate then widen with zero (ZeroUppers) or undef uppers.
  if ((NumSrcElts * DstEltSizeInBits) >= 128) {
    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
    return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
                          DstVT.getSizeInBits());
  }

  // Non-VLX targets must truncate from a 512-bit type, so we need to
  // widen, truncate and then possibly extract the original subvector.
  // NOTE: this recurses with the 512-bit widened source.
  if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
    SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
    return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
  }

  // Fallback to a X86ISD::VTRUNC, padding if necessary.
  MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
  SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
  if (DstVT != TruncVT)
    Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
                          DstVT.getSizeInBits());
  return Trunc;
}
10657
10658// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10659//
10660// An example is the following:
10661//
10662// t0: ch = EntryToken
10663// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10664// t25: v4i32 = truncate t2
10665// t41: v8i16 = bitcast t25
10666// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10667// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10668// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10669// t18: v2i64 = bitcast t51
10670//
10671// One can just use a single vpmovdw instruction, without avx512vl we need to
10672// use the zmm variant and extract the lower subvector, padding with zeroes.
10673// TODO: Merge with lowerShuffleAsVTRUNC.
10675 SDValue V2, ArrayRef<int> Mask,
10676 const APInt &Zeroable,
10677 const X86Subtarget &Subtarget,
10678 SelectionDAG &DAG) {
10679 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10680 if (!Subtarget.hasAVX512())
10681 return SDValue();
10682
10683 unsigned NumElts = VT.getVectorNumElements();
10684 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10685 unsigned MaxScale = 64 / EltSizeInBits;
10686 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10687 unsigned SrcEltBits = EltSizeInBits * Scale;
10688 unsigned NumSrcElts = NumElts / Scale;
10689 unsigned UpperElts = NumElts - NumSrcElts;
10690 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10691 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10692 continue;
10693
10694 // Attempt to find a matching source truncation, but as a fall back VLX
10695 // cases can use the VPMOV directly.
10696 SDValue Src = peekThroughBitcasts(V1);
10697 if (Src.getOpcode() == ISD::TRUNCATE &&
10698 Src.getScalarValueSizeInBits() == SrcEltBits) {
10699 Src = Src.getOperand(0);
10700 } else if (Subtarget.hasVLX()) {
10701 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10702 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10703 Src = DAG.getBitcast(SrcVT, Src);
10704 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10705 if (Scale == 2 &&
10706 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10707 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10708 return SDValue();
10709 } else
10710 return SDValue();
10711
10712 // VPMOVWB is only available with avx512bw.
10713 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10714 return SDValue();
10715
10716 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10717 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10718 }
10719
10720 return SDValue();
10721}
10722
10723// Attempt to match binary shuffle patterns as a truncate.
10725 SDValue V2, ArrayRef<int> Mask,
10726 const APInt &Zeroable,
10727 const X86Subtarget &Subtarget,
10728 SelectionDAG &DAG) {
10729 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10730 "Unexpected VTRUNC type");
10731 if (!Subtarget.hasAVX512() ||
10732 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10733 return SDValue();
10734
10735 unsigned NumElts = VT.getVectorNumElements();
10736 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10737 unsigned MaxScale = 64 / EltSizeInBits;
10738 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10739 // TODO: Support non-BWI VPMOVWB truncations?
10740 unsigned SrcEltBits = EltSizeInBits * Scale;
10741 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10742 continue;
10743
10744 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10745 // Bail if the V2 elements are undef.
10746 unsigned NumHalfSrcElts = NumElts / Scale;
10747 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10748 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10749 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10750 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10751 continue;
10752
10753 // The elements beyond the truncation must be undef/zero.
10754 unsigned UpperElts = NumElts - NumSrcElts;
10755 if (UpperElts > 0 &&
10756 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10757 continue;
10758 bool UndefUppers =
10759 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10760
10761 // As we're using both sources then we need to concat them together
10762 // and truncate from the double-sized src.
10763 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10764
10765 // For offset truncations, ensure that the concat is cheap.
10766 SDValue Src =
10767 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10768 if (!Src) {
10769 if (Offset)
10770 continue;
10771 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10772 }
10773
10774 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10775 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10776 Src = DAG.getBitcast(SrcVT, Src);
10777
10778 // Shift the offset'd elements into place for the truncation.
10779 // TODO: Use getTargetVShiftByConstNode.
10780 if (Offset)
10781 Src = DAG.getNode(
10782 X86ISD::VSRLI, DL, SrcVT, Src,
10783 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10784
10785 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10786 }
10787 }
10788
10789 return SDValue();
10790}
10791
10792/// Check whether a compaction lowering can be done by dropping even/odd
10793/// elements and compute how many times even/odd elements must be dropped.
10794///
10795/// This handles shuffles which take every Nth element where N is a power of
10796/// two. Example shuffle masks:
10797///
10798/// (even)
10799/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10800/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10801/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10802/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10803/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10804/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10805///
10806/// (odd)
10807/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10808/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10809///
10810/// Any of these lanes can of course be undef.
10811///
10812/// This routine only supports N <= 3.
10813/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10814/// for larger N.
10815///
10816/// \returns N above, or the number of times even/odd elements must be dropped
10817/// if there is such a number. Otherwise returns zero.
10818static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10819 bool IsSingleInput) {
10820 // The modulus for the shuffle vector entries is based on whether this is
10821 // a single input or not.
10822 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10823 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10824 "We should only be called with masks with a power-of-2 size!");
10825
10826 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10827 int Offset = MatchEven ? 0 : 1;
10828
10829 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10830 // and 2^3 simultaneously. This is because we may have ambiguity with
10831 // partially undef inputs.
10832 bool ViableForN[3] = {true, true, true};
10833
10834 for (int i = 0, e = Mask.size(); i < e; ++i) {
10835 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10836 // want.
10837 if (Mask[i] < 0)
10838 continue;
10839
10840 bool IsAnyViable = false;
10841 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10842 if (ViableForN[j]) {
10843 uint64_t N = j + 1;
10844
10845 // The shuffle mask must be equal to (i * 2^N) % M.
10846 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10847 IsAnyViable = true;
10848 else
10849 ViableForN[j] = false;
10850 }
10851 // Early exit if we exhaust the possible powers of two.
10852 if (!IsAnyViable)
10853 break;
10854 }
10855
10856 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10857 if (ViableForN[j])
10858 return j + 1;
10859
10860 // Return 0 as there is no viable power of two.
10861 return 0;
10862}
10863
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
// Checks for compaction shuffle masks if MaxStages > 1.
// On success, fills SrcVT/PackOpcode and may update V1/V2 to the (bitcast
// peeked) pack operands.
// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
                                 const SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 unsigned MaxStages = 1) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
         "Illegal maximum compaction");

  // Checks whether N1/N2 can act as PACKUS/PACKSS sources of type PackVT.
  auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
    unsigned NumSrcBits = PackVT.getScalarSizeInBits();
    unsigned NumPackedBits = NumSrcBits - BitSize;
    N1 = peekThroughBitcasts(N1);
    N2 = peekThroughBitcasts(N2);
    unsigned NumBits1 = N1.getScalarValueSizeInBits();
    unsigned NumBits2 = N2.getScalarValueSizeInBits();
    bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
    bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
    // Each source must either be undef/zero or have the pack source width.
    if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
        (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
      return false;
    // Prefer PACKUS when the packed-away upper bits are known zero.
    if (Subtarget.hasSSE41() || BitSize == 8) {
      APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
      if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
          (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
        V1 = N1;
        V2 = N2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }
    // Otherwise PACKSS if each source has enough sign bits to survive the
    // saturating pack.
    bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
    bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
    if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
         DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
        (N2.isUndef() || IsZero2 || IsAllOnes2 ||
         DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
      V1 = N1;
      V2 = N2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }
    return false;
  };

  // Attempt to match against wider and wider compaction patterns.
  for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
    MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
    MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);

    // Try binary shuffle.
    SmallVector<int, 32> BinaryMask;
    createPackShuffleMask(VT, BinaryMask, false, NumStages);
    if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
      if (MatchPACK(V1, V2, PackVT))
        return true;

    // Try unary shuffle.
    SmallVector<int, 32> UnaryMask;
    createPackShuffleMask(VT, UnaryMask, true, NumStages);
    if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
      if (MatchPACK(V1, V1, PackVT))
        return true;
  }

  return false;
}
10938
10940 SDValue V2, ArrayRef<int> Mask,
10941 const X86Subtarget &Subtarget,
10942 SelectionDAG &DAG) {
10943 MVT PackVT;
10944 unsigned PackOpcode;
10945 unsigned SizeBits = VT.getSizeInBits();
10946 unsigned EltBits = VT.getScalarSizeInBits();
10947 unsigned MaxStages = Log2_32(64 / EltBits);
10948 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10949 Subtarget, MaxStages))
10950 return SDValue();
10951
10952 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10953 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10954
10955 // Don't lower multi-stage packs on AVX512, truncation is better.
10956 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10957 return SDValue();
10958
10959 // Pack to the largest type possible:
10960 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10961 unsigned MaxPackBits = 16;
10962 if (CurrentEltBits > 16 &&
10963 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10964 MaxPackBits = 32;
10965
10966 // Repeatedly pack down to the target size.
10967 SDValue Res;
10968 for (unsigned i = 0; i != NumStages; ++i) {
10969 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10970 unsigned NumSrcElts = SizeBits / SrcEltBits;
10971 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10972 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10973 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10974 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10975 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10976 DAG.getBitcast(SrcVT, V2));
10977 V1 = V2 = Res;
10978 CurrentEltBits /= 2;
10979 }
10980 assert(Res && Res.getValueType() == VT &&
10981 "Failed to lower compaction shuffle");
10982 return Res;
10983}
10984
10985/// Try to emit a bitmask instruction for a shuffle.
10986///
10987/// This handles cases where we can model a blend exactly as a bitmask due to
10988/// one of the inputs being zeroable.
10990 SDValue V2, ArrayRef<int> Mask,
10991 const APInt &Zeroable,
10992 const X86Subtarget &Subtarget,
10993 SelectionDAG &DAG) {
10994 MVT MaskVT = VT;
10995 MVT EltVT = VT.getVectorElementType();
10996 SDValue Zero, AllOnes;
10997 // Use f64 if i64 isn't legal.
10998 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10999 EltVT = MVT::f64;
11000 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11001 }
11002
11003 MVT LogicVT = VT;
11004 if (EltVT.isFloatingPoint()) {
11005 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11006 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11007 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11008 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11009 } else {
11010 Zero = DAG.getConstant(0, DL, EltVT);
11011 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11012 }
11013
11014 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11015 SDValue V;
11016 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11017 if (Zeroable[i])
11018 continue;
11019 if (Mask[i] % Size != i)
11020 return SDValue(); // Not a blend.
11021 if (!V)
11022 V = Mask[i] < Size ? V1 : V2;
11023 else if (V != (Mask[i] < Size ? V1 : V2))
11024 return SDValue(); // Can only let one input through the mask.
11025
11026 VMaskOps[i] = AllOnes;
11027 }
11028 if (!V)
11029 return SDValue(); // No non-zeroable elements!
11030
11031 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11032 VMask = DAG.getBitcast(LogicVT, VMask);
11033 V = DAG.getBitcast(LogicVT, V);
11034 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11035 return DAG.getBitcast(VT, And);
11036}
11037
11038/// Try to emit a blend instruction for a shuffle using bit math.
11039///
11040/// This is used as a fallback approach when first class blend instructions are
11041/// unavailable. Currently it is only suitable for integer vectors, but could
11042/// be generalized for floating point vectors if desirable.
11044 SDValue V2, ArrayRef<int> Mask,
11045 SelectionDAG &DAG) {
11046 assert(VT.isInteger() && "Only supports integer vector types!");
11047 MVT EltVT = VT.getVectorElementType();
11048 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11049 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11051 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11052 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11053 return SDValue(); // Shuffled input!
11054 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11055 }
11056
11057 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11058 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11059}
11060
11062 SDValue PreservedSrc,
11063 const X86Subtarget &Subtarget,
11064 SelectionDAG &DAG);
11065
11068 const APInt &Zeroable, bool &ForceV1Zero,
11069 bool &ForceV2Zero, uint64_t &BlendMask) {
11070 bool V1IsZeroOrUndef =
11072 bool V2IsZeroOrUndef =
11074
11075 BlendMask = 0;
11076 ForceV1Zero = false, ForceV2Zero = false;
11077 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11078
11079 int NumElts = Mask.size();
11080 int NumLanes = VT.getSizeInBits() / 128;
11081 int NumEltsPerLane = NumElts / NumLanes;
11082 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11083
11084 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11085 // then ensure the blend mask part for that lane just references that input.
11086 bool ForceWholeLaneMasks =
11087 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11088
11089 // Attempt to generate the binary blend mask. If an input is zero then
11090 // we can use any lane.
11091 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11092 // Keep track of the inputs used per lane.
11093 bool LaneV1InUse = false;
11094 bool LaneV2InUse = false;
11095 uint64_t LaneBlendMask = 0;
11096 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11097 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11098 int M = Mask[Elt];
11099 if (M == SM_SentinelUndef)
11100 continue;
11101 if (M == Elt || (0 <= M && M < NumElts &&
11102 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11103 Mask[Elt] = Elt;
11104 LaneV1InUse = true;
11105 continue;
11106 }
11107 if (M == (Elt + NumElts) ||
11108 (NumElts <= M &&
11109 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11110 LaneBlendMask |= 1ull << LaneElt;
11111 Mask[Elt] = Elt + NumElts;
11112 LaneV2InUse = true;
11113 continue;
11114 }
11115 if (Zeroable[Elt]) {
11116 if (V1IsZeroOrUndef) {
11117 ForceV1Zero = true;
11118 Mask[Elt] = Elt;
11119 LaneV1InUse = true;
11120 continue;
11121 }
11122 if (V2IsZeroOrUndef) {
11123 ForceV2Zero = true;
11124 LaneBlendMask |= 1ull << LaneElt;
11125 Mask[Elt] = Elt + NumElts;
11126 LaneV2InUse = true;
11127 continue;
11128 }
11129 }
11130 return false;
11131 }
11132
11133 // If we only used V2 then splat the lane blend mask to avoid any demanded
11134 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11135 // blend mask bit).
11136 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11137 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11138
11139 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11140 }
11141 return true;
11142}
11143
11144/// Try to emit a blend instruction for a shuffle.
11145///
11146/// This doesn't do any checks for the availability of instructions for blending
11147/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11148/// be matched in the backend with the type given. What it does check for is
11149/// that the shuffle mask is a blend, or convertible into a blend with zero.
11151 SDValue V2, ArrayRef<int> Original,
11152 const APInt &Zeroable,
11153 const X86Subtarget &Subtarget,
11154 SelectionDAG &DAG) {
11155 uint64_t BlendMask = 0;
11156 bool ForceV1Zero = false, ForceV2Zero = false;
11157 SmallVector<int, 64> Mask(Original);
11158 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11159 BlendMask))
11160 return SDValue();
11161
11162 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11163 if (ForceV1Zero)
11164 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11165 if (ForceV2Zero)
11166 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11167
11168 unsigned NumElts = VT.getVectorNumElements();
11169
11170 switch (VT.SimpleTy) {
11171 case MVT::v4i64:
11172 case MVT::v8i32:
11173 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11174 [[fallthrough]];
11175 case MVT::v4f64:
11176 case MVT::v8f32:
11177 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11178 [[fallthrough]];
11179 case MVT::v2f64:
11180 case MVT::v2i64:
11181 case MVT::v4f32:
11182 case MVT::v4i32:
11183 case MVT::v8i16:
11184 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11185 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11186 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11187 case MVT::v16i16: {
11188 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11189 SmallVector<int, 8> RepeatedMask;
11190 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11191 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11192 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11193 BlendMask = 0;
11194 for (int i = 0; i < 8; ++i)
11195 if (RepeatedMask[i] >= 8)
11196 BlendMask |= 1ull << i;
11197 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11198 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11199 }
11200 // Use PBLENDW for lower/upper lanes and then blend lanes.
11201 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11202 // merge to VSELECT where useful.
11203 uint64_t LoMask = BlendMask & 0xFF;
11204 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11205 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11206 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11207 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11208 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11209 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11210 return DAG.getVectorShuffle(
11211 MVT::v16i16, DL, Lo, Hi,
11212 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11213 }
11214 [[fallthrough]];
11215 }
11216 case MVT::v32i8:
11217 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11218 [[fallthrough]];
11219 case MVT::v16i8: {
11220 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11221
11222 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11223 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11224 Subtarget, DAG))
11225 return Masked;
11226
11227 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11228 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11229 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11230 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11231 }
11232
11233 // If we have VPTERNLOG, we can use that as a bit blend.
11234 if (Subtarget.hasVLX())
11235 if (SDValue BitBlend =
11236 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11237 return BitBlend;
11238
11239 // Scale the blend by the number of bytes per element.
11240 int Scale = VT.getScalarSizeInBits() / 8;
11241
11242 // This form of blend is always done on bytes. Compute the byte vector
11243 // type.
11244 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11245
11246 // x86 allows load folding with blendvb from the 2nd source operand. But
11247 // we are still using LLVM select here (see comment below), so that's V1.
11248 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11249 // allow that load-folding possibility.
11250 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11252 std::swap(V1, V2);
11253 }
11254
11255 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11256 // mix of LLVM's code generator and the x86 backend. We tell the code
11257 // generator that boolean values in the elements of an x86 vector register
11258 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11259 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11260 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11261 // of the element (the remaining are ignored) and 0 in that high bit would
11262 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11263 // the LLVM model for boolean values in vector elements gets the relevant
11264 // bit set, it is set backwards and over constrained relative to x86's
11265 // actual model.
11266 SmallVector<SDValue, 32> VSELECTMask;
11267 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11268 for (int j = 0; j < Scale; ++j)
11269 VSELECTMask.push_back(
11270 Mask[i] < 0
11271 ? DAG.getUNDEF(MVT::i8)
11272 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11273
11274 V1 = DAG.getBitcast(BlendVT, V1);
11275 V2 = DAG.getBitcast(BlendVT, V2);
11276 return DAG.getBitcast(
11277 VT,
11278 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11279 V1, V2));
11280 }
11281 case MVT::v16f32:
11282 case MVT::v8f64:
11283 case MVT::v8i64:
11284 case MVT::v16i32:
11285 case MVT::v32i16:
11286 case MVT::v64i8: {
11287 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11288 bool OptForSize = DAG.shouldOptForSize();
11289 if (!OptForSize) {
11290 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11291 Subtarget, DAG))
11292 return Masked;
11293 }
11294
11295 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11296 // masked move.
11297 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11298 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11299 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11300 }
11301 default:
11302 llvm_unreachable("Not a supported integer vector type!");
11303 }
11304}
11305
11306/// Try to lower as a blend of elements from two inputs followed by
11307/// a single-input permutation.
11308///
11309/// This matches the pattern where we can blend elements from two inputs and
11310/// then reduce the shuffle to a single-input permutation.
11312 SDValue V1, SDValue V2,
11313 ArrayRef<int> Mask,
11314 SelectionDAG &DAG,
11315 bool ImmBlends = false) {
11316 // We build up the blend mask while checking whether a blend is a viable way
11317 // to reduce the shuffle.
11318 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11319 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11320
11321 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11322 if (Mask[i] < 0)
11323 continue;
11324
11325 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11326
11327 if (BlendMask[Mask[i] % Size] < 0)
11328 BlendMask[Mask[i] % Size] = Mask[i];
11329 else if (BlendMask[Mask[i] % Size] != Mask[i])
11330 return SDValue(); // Can't blend in the needed input!
11331
11332 PermuteMask[i] = Mask[i] % Size;
11333 }
11334
11335 // If only immediate blends, then bail if the blend mask can't be widened to
11336 // i16.
11337 unsigned EltSize = VT.getScalarSizeInBits();
11338 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11339 return SDValue();
11340
11341 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11342 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11343}
11344
11345/// Try to lower as an unpack of elements from two inputs followed by
11346/// a single-input permutation.
11347///
11348/// This matches the pattern where we can unpack elements from two inputs and
11349/// then reduce the shuffle to a single-input (wider) permutation.
11351 SDValue V1, SDValue V2,
11352 ArrayRef<int> Mask,
11353 SelectionDAG &DAG) {
11354 int NumElts = Mask.size();
11355 int NumLanes = VT.getSizeInBits() / 128;
11356 int NumLaneElts = NumElts / NumLanes;
11357 int NumHalfLaneElts = NumLaneElts / 2;
11358
11359 bool MatchLo = true, MatchHi = true;
11360 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11361
11362 // Determine UNPCKL/UNPCKH type and operand order.
11363 for (int Elt = 0; Elt != NumElts; ++Elt) {
11364 int M = Mask[Elt];
11365 if (M < 0)
11366 continue;
11367
11368 // Normalize the mask value depending on whether it's V1 or V2.
11369 int NormM = M;
11370 SDValue &Op = Ops[Elt & 1];
11371 if (M < NumElts && (Op.isUndef() || Op == V1))
11372 Op = V1;
11373 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11374 Op = V2;
11375 NormM -= NumElts;
11376 } else
11377 return SDValue();
11378
11379 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11380 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11381 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11382 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11383 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11384 if (MatchLoAnyLane || MatchHiAnyLane) {
11385 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11386 "Failed to match UNPCKLO/UNPCKHI");
11387 break;
11388 }
11389 }
11390 MatchLo &= MatchLoAnyLane;
11391 MatchHi &= MatchHiAnyLane;
11392 if (!MatchLo && !MatchHi)
11393 return SDValue();
11394 }
11395 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11396
11397 // Element indices have changed after unpacking. Calculate permute mask
11398 // so that they will be put back to the position as dictated by the
11399 // original shuffle mask indices.
11400 SmallVector<int, 32> PermuteMask(NumElts, -1);
11401 for (int Elt = 0; Elt != NumElts; ++Elt) {
11402 int M = Mask[Elt];
11403 if (M < 0)
11404 continue;
11405 int NormM = M;
11406 if (NumElts <= M)
11407 NormM -= NumElts;
11408 bool IsFirstOp = M < NumElts;
11409 int BaseMaskElt =
11410 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11411 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11412 PermuteMask[Elt] = BaseMaskElt;
11413 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11414 PermuteMask[Elt] = BaseMaskElt + 1;
11415 assert(PermuteMask[Elt] != -1 &&
11416 "Input mask element is defined but failed to assign permute mask");
11417 }
11418
11419 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11420 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11421 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11422}
11423
11424/// Try to lower a shuffle as a permute of the inputs followed by an
11425/// UNPCK instruction.
11426///
11427/// This specifically targets cases where we end up with alternating between
11428/// the two inputs, and so can permute them into something that feeds a single
11429/// UNPCK instruction. Note that this routine only targets integer vectors
11430/// because for floating point vectors we have a generalized SHUFPS lowering
11431/// strategy that handles everything that doesn't *exactly* match an unpack,
11432/// making this clever lowering unnecessary.
11434 SDValue V1, SDValue V2,
11435 ArrayRef<int> Mask,
11436 const X86Subtarget &Subtarget,
11437 SelectionDAG &DAG) {
11438 int Size = Mask.size();
11439 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11440
11441 // This routine only supports 128-bit integer dual input vectors.
11442 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11443 return SDValue();
11444
11445 int NumLoInputs =
11446 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11447 int NumHiInputs =
11448 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11449
11450 bool UnpackLo = NumLoInputs >= NumHiInputs;
11451
11452 auto TryUnpack = [&](int ScalarSize, int Scale) {
11453 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11454 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11455
11456 for (int i = 0; i < Size; ++i) {
11457 if (Mask[i] < 0)
11458 continue;
11459
11460 // Each element of the unpack contains Scale elements from this mask.
11461 int UnpackIdx = i / Scale;
11462
11463 // We only handle the case where V1 feeds the first slots of the unpack.
11464 // We rely on canonicalization to ensure this is the case.
11465 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11466 return SDValue();
11467
11468 // Setup the mask for this input. The indexing is tricky as we have to
11469 // handle the unpack stride.
11470 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11471 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11472 Mask[i] % Size;
11473 }
11474
11475 // If we will have to shuffle both inputs to use the unpack, check whether
11476 // we can just unpack first and shuffle the result. If so, skip this unpack.
11477 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11478 !isNoopShuffleMask(V2Mask))
11479 return SDValue();
11480
11481 // Shuffle the inputs into place.
11482 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11483 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11484
11485 // Cast the inputs to the type we will use to unpack them.
11486 MVT UnpackVT =
11487 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11488 V1 = DAG.getBitcast(UnpackVT, V1);
11489 V2 = DAG.getBitcast(UnpackVT, V2);
11490
11491 // Unpack the inputs and cast the result back to the desired type.
11492 return DAG.getBitcast(
11493 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11494 UnpackVT, V1, V2));
11495 };
11496
11497 // We try each unpack from the largest to the smallest to try and find one
11498 // that fits this mask.
11499 int OrigScalarSize = VT.getScalarSizeInBits();
11500 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11501 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11502 return Unpack;
11503
11504 // If we're shuffling with a zero vector then we're better off not doing
11505 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11508 return SDValue();
11509
11510 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11511 // initial unpack.
11512 if (NumLoInputs == 0 || NumHiInputs == 0) {
11513 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11514 "We have to have *some* inputs!");
11515 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11516
11517 // FIXME: We could consider the total complexity of the permute of each
11518 // possible unpacking. Or at the least we should consider how many
11519 // half-crossings are created.
11520 // FIXME: We could consider commuting the unpacks.
11521
11522 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11523 for (int i = 0; i < Size; ++i) {
11524 if (Mask[i] < 0)
11525 continue;
11526
11527 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11528
11529 PermMask[i] =
11530 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11531 }
11532 return DAG.getVectorShuffle(
11533 VT, DL,
11534 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11535 V1, V2),
11536 DAG.getUNDEF(VT), PermMask);
11537 }
11538
11539 return SDValue();
11540}
11541
11542/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11543/// permuting the elements of the result in place.
11545 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11546 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11547 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11548 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11549 (VT.is512BitVector() && !Subtarget.hasBWI()))
11550 return SDValue();
11551
11552 // We don't currently support lane crossing permutes.
11553 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11554 return SDValue();
11555
11556 int Scale = VT.getScalarSizeInBits() / 8;
11557 int NumLanes = VT.getSizeInBits() / 128;
11558 int NumElts = VT.getVectorNumElements();
11559 int NumEltsPerLane = NumElts / NumLanes;
11560
11561 // Determine range of mask elts.
11562 bool Blend1 = true;
11563 bool Blend2 = true;
11564 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11565 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11566 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11567 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11568 int M = Mask[Lane + Elt];
11569 if (M < 0)
11570 continue;
11571 if (M < NumElts) {
11572 Blend1 &= (M == (Lane + Elt));
11573 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11574 M = M % NumEltsPerLane;
11575 Range1.first = std::min(Range1.first, M);
11576 Range1.second = std::max(Range1.second, M);
11577 } else {
11578 M -= NumElts;
11579 Blend2 &= (M == (Lane + Elt));
11580 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11581 M = M % NumEltsPerLane;
11582 Range2.first = std::min(Range2.first, M);
11583 Range2.second = std::max(Range2.second, M);
11584 }
11585 }
11586 }
11587
11588 // Bail if we don't need both elements.
11589 // TODO - it might be worth doing this for unary shuffles if the permute
11590 // can be widened.
11591 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11592 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11593 return SDValue();
11594
11595 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11596 return SDValue();
11597
11598 // Rotate the 2 ops so we can access both ranges, then permute the result.
11599 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11600 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11601 SDValue Rotate = DAG.getBitcast(
11602 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11603 DAG.getBitcast(ByteVT, Lo),
11604 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11605 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11606 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11607 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11608 int M = Mask[Lane + Elt];
11609 if (M < 0)
11610 continue;
11611 if (M < NumElts)
11612 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11613 else
11614 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11615 }
11616 }
11617 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11618 };
11619
11620 // Check if the ranges are small enough to rotate from either direction.
11621 if (Range2.second < Range1.first)
11622 return RotateAndPermute(V1, V2, Range1.first, 0);
11623 if (Range1.second < Range2.first)
11624 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11625 return SDValue();
11626}
11627
11629 return isUndefOrEqual(Mask, 0);
11630}
11631
11633 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11634}
11635
11636/// Check if the Mask consists of the same element repeated multiple times.
11638 size_t NumUndefs = 0;
11639 std::optional<int> UniqueElt;
11640 for (int Elt : Mask) {
11641 if (Elt == SM_SentinelUndef) {
11642 NumUndefs++;
11643 continue;
11644 }
11645 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11646 return false;
11647 UniqueElt = Elt;
11648 }
11649 // Make sure the element is repeated enough times by checking the number of
11650 // undefs is small.
11651 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11652}
11653
11654/// Generic routine to decompose a shuffle and blend into independent
11655/// blends and permutes.
11656///
11657/// This matches the extremely common pattern for handling combined
11658/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11659/// operations. It will try to pick the best arrangement of shuffles and
11660/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11662 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11663 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11664 int NumElts = Mask.size();
11665 int NumLanes = VT.getSizeInBits() / 128;
11666 int NumEltsPerLane = NumElts / NumLanes;
11667
11668 // Shuffle the input elements into the desired positions in V1 and V2 and
11669 // unpack/blend them together.
11670 bool IsAlternating = true;
11671 bool V1Zero = true, V2Zero = true;
11672 SmallVector<int, 32> V1Mask(NumElts, -1);
11673 SmallVector<int, 32> V2Mask(NumElts, -1);
11674 SmallVector<int, 32> FinalMask(NumElts, -1);
11675 for (int i = 0; i < NumElts; ++i) {
11676 int M = Mask[i];
11677 if (M >= 0 && M < NumElts) {
11678 V1Mask[i] = M;
11679 FinalMask[i] = i;
11680 V1Zero &= Zeroable[i];
11681 IsAlternating &= (i & 1) == 0;
11682 } else if (M >= NumElts) {
11683 V2Mask[i] = M - NumElts;
11684 FinalMask[i] = i + NumElts;
11685 V2Zero &= Zeroable[i];
11686 IsAlternating &= (i & 1) == 1;
11687 }
11688 }
11689
11690 // If we effectively only demand the 0'th element of \p Input, and not only
11691 // as 0'th element, then broadcast said input,
11692 // and change \p InputMask to be a no-op (identity) mask.
11693 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11694 &DAG](SDValue &Input,
11695 MutableArrayRef<int> InputMask) {
11696 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11697 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11698 !X86::mayFoldLoad(Input, Subtarget)))
11699 return;
11700 if (isNoopShuffleMask(InputMask))
11701 return;
11702 assert(isBroadcastShuffleMask(InputMask) &&
11703 "Expected to demand only the 0'th element.");
11704 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11705 for (auto I : enumerate(InputMask)) {
11706 int &InputMaskElt = I.value();
11707 if (InputMaskElt >= 0)
11708 InputMaskElt = I.index();
11709 }
11710 };
11711
11712 // Currently, we may need to produce one shuffle per input, and blend results.
11713 // It is possible that the shuffle for one of the inputs is already a no-op.
11714 // See if we can simplify non-no-op shuffles into broadcasts,
11715 // which we consider to be strictly better than an arbitrary shuffle.
11716 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11718 canonicalizeBroadcastableInput(V1, V1Mask);
11719 canonicalizeBroadcastableInput(V2, V2Mask);
11720 }
11721
11722 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11723 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11724 // the shuffle may be able to fold with a load or other benefit. However, when
11725 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11726 // pre-shuffle first is a better strategy.
11727 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11728 // Only prefer immediate blends to unpack/rotate.
11729 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11730 DAG, true))
11731 return BlendPerm;
11732 // If either input vector provides only a single element which is repeated
11733 // multiple times, unpacking from both input vectors would generate worse
11734 // code. e.g. for
11735 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11736 // it is better to process t4 first to create a vector of t4[0], then unpack
11737 // that vector with t2.
11738 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11740 if (SDValue UnpackPerm =
11741 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11742 return UnpackPerm;
11744 DL, VT, V1, V2, Mask, Subtarget, DAG))
11745 return RotatePerm;
11746 // Unpack/rotate failed - try again with variable blends.
11747 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11748 DAG))
11749 return BlendPerm;
11750 if (VT.getScalarSizeInBits() >= 32)
11751 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11752 DL, VT, V1, V2, Mask, Subtarget, DAG))
11753 return PermUnpack;
11754 }
11755
11756 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11757 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11758 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11759 // than half the elements coming from each source.
11760 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11761 V1Mask.assign(NumElts, -1);
11762 V2Mask.assign(NumElts, -1);
11763 FinalMask.assign(NumElts, -1);
11764 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11765 for (int j = 0; j != NumEltsPerLane; ++j) {
11766 int M = Mask[i + j];
11767 if (M >= 0 && M < NumElts) {
11768 V1Mask[i + (j / 2)] = M;
11769 FinalMask[i + j] = i + (j / 2);
11770 } else if (M >= NumElts) {
11771 V2Mask[i + (j / 2)] = M - NumElts;
11772 FinalMask[i + j] = i + (j / 2) + NumElts;
11773 }
11774 }
11775 }
11776
11777 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11778 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11779 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11780}
11781
11782static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11783 const X86Subtarget &Subtarget,
11784 ArrayRef<int> Mask) {
11785 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11786 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11787
11788 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11789 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11790 int MaxSubElts = 64 / EltSizeInBits;
11791 unsigned RotateAmt, NumSubElts;
11792 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11793 MaxSubElts, NumSubElts, RotateAmt))
11794 return -1;
11795 unsigned NumElts = Mask.size();
11796 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11797 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11798 return RotateAmt;
11799}
11800
11801/// Lower shuffle using X86ISD::VROTLI rotations.
11803 ArrayRef<int> Mask,
11804 const X86Subtarget &Subtarget,
11805 SelectionDAG &DAG) {
11806 // Only XOP + AVX512 targets have bit rotation instructions.
11807 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11808 bool IsLegal =
11809 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11810 if (!IsLegal && Subtarget.hasSSE3())
11811 return SDValue();
11812
11813 MVT RotateVT;
11814 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11815 Subtarget, Mask);
11816 if (RotateAmt < 0)
11817 return SDValue();
11818
11819 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11820 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11821 // widen to vXi16 or more then existing lowering should will be better.
11822 if (!IsLegal) {
11823 if ((RotateAmt % 16) == 0)
11824 return SDValue();
11825 // TODO: Use getTargetVShiftByConstNode.
11826 unsigned ShlAmt = RotateAmt;
11827 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11828 V1 = DAG.getBitcast(RotateVT, V1);
11829 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11830 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11831 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11832 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11833 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11834 return DAG.getBitcast(VT, Rot);
11835 }
11836
11837 SDValue Rot =
11838 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11839 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11840 return DAG.getBitcast(VT, Rot);
11841}
11842
11843/// Try to match a vector shuffle as an element rotation.
11844///
11845/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11847 ArrayRef<int> Mask) {
11848 int NumElts = Mask.size();
11849
11850 // We need to detect various ways of spelling a rotation:
11851 // [11, 12, 13, 14, 15, 0, 1, 2]
11852 // [-1, 12, 13, 14, -1, -1, 1, -1]
11853 // [-1, -1, -1, -1, -1, -1, 1, 2]
11854 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11855 // [-1, 4, 5, 6, -1, -1, 9, -1]
11856 // [-1, 4, 5, 6, -1, -1, -1, -1]
11857 int Rotation = 0;
11858 SDValue Lo, Hi;
11859 for (int i = 0; i < NumElts; ++i) {
11860 int M = Mask[i];
11861 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11862 "Unexpected mask index.");
11863 if (M < 0)
11864 continue;
11865
11866 // Determine where a rotated vector would have started.
11867 int StartIdx = i - (M % NumElts);
11868 if (StartIdx == 0)
11869 // The identity rotation isn't interesting, stop.
11870 return -1;
11871
11872 // If we found the tail of a vector the rotation must be the missing
11873 // front. If we found the head of a vector, it must be how much of the
11874 // head.
11875 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11876
11877 if (Rotation == 0)
11878 Rotation = CandidateRotation;
11879 else if (Rotation != CandidateRotation)
11880 // The rotations don't match, so we can't match this mask.
11881 return -1;
11882
11883 // Compute which value this mask is pointing at.
11884 SDValue MaskV = M < NumElts ? V1 : V2;
11885
11886 // Compute which of the two target values this index should be assigned
11887 // to. This reflects whether the high elements are remaining or the low
11888 // elements are remaining.
11889 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11890
11891 // Either set up this value if we've not encountered it before, or check
11892 // that it remains consistent.
11893 if (!TargetV)
11894 TargetV = MaskV;
11895 else if (TargetV != MaskV)
11896 // This may be a rotation, but it pulls from the inputs in some
11897 // unsupported interleaving.
11898 return -1;
11899 }
11900
11901 // Check that we successfully analyzed the mask, and normalize the results.
11902 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11903 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11904 if (!Lo)
11905 Lo = Hi;
11906 else if (!Hi)
11907 Hi = Lo;
11908
11909 V1 = Lo;
11910 V2 = Hi;
11911
11912 return Rotation;
11913}
11914
11915/// Try to lower a vector shuffle as a byte rotation.
11916///
11917/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11918/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11919/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11920/// try to generically lower a vector shuffle through such an pattern. It
11921/// does not check for the profitability of lowering either as PALIGNR or
11922/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11923/// This matches shuffle vectors that look like:
11924///
11925/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11926///
11927/// Essentially it concatenates V1 and V2, shifts right by some number of
11928/// elements, and takes the low elements as the result. Note that while this is
11929/// specified as a *right shift* because x86 is little-endian, it is a *left
11930/// rotate* of the vector lanes.
11932 ArrayRef<int> Mask) {
11933 // Don't accept any shuffles with zero elements.
11934 if (isAnyZero(Mask))
11935 return -1;
11936
11937 // PALIGNR works on 128-bit lanes.
11938 SmallVector<int, 16> RepeatedMask;
11939 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11940 return -1;
11941
11942 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11943 if (Rotation <= 0)
11944 return -1;
11945
11946 // PALIGNR rotates bytes, so we need to scale the
11947 // rotation based on how many bytes are in the vector lane.
11948 int NumElts = RepeatedMask.size();
11949 int Scale = 16 / NumElts;
11950 return Rotation * Scale;
11951}
11952
11954 SDValue V2, ArrayRef<int> Mask,
11955 const X86Subtarget &Subtarget,
11956 SelectionDAG &DAG) {
11957 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11958
11959 SDValue Lo = V1, Hi = V2;
11960 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11961 if (ByteRotation <= 0)
11962 return SDValue();
11963
11964 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11965 // PSLLDQ/PSRLDQ.
11966 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11967 Lo = DAG.getBitcast(ByteVT, Lo);
11968 Hi = DAG.getBitcast(ByteVT, Hi);
11969
11970 // SSSE3 targets can use the palignr instruction.
11971 if (Subtarget.hasSSSE3()) {
11972 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11973 "512-bit PALIGNR requires BWI instructions");
11974 return DAG.getBitcast(
11975 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11976 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11977 }
11978
11979 assert(VT.is128BitVector() &&
11980 "Rotate-based lowering only supports 128-bit lowering!");
11981 assert(Mask.size() <= 16 &&
11982 "Can shuffle at most 16 bytes in a 128-bit vector!");
11983 assert(ByteVT == MVT::v16i8 &&
11984 "SSE2 rotate lowering only needed for v16i8!");
11985
11986 // Default SSE2 implementation
11987 int LoByteShift = 16 - ByteRotation;
11988 int HiByteShift = ByteRotation;
11989
11990 SDValue LoShift =
11991 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11992 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11993 SDValue HiShift =
11994 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11995 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11996 return DAG.getBitcast(VT,
11997 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11998}
11999
12000/// Try to lower a vector shuffle as a dword/qword rotation.
12001///
12002/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
12003/// rotation of the concatenation of two vectors; This routine will
12004/// try to generically lower a vector shuffle through such an pattern.
12005///
12006/// Essentially it concatenates V1 and V2, shifts right by some number of
12007/// elements, and takes the low elements as the result. Note that while this is
12008/// specified as a *right shift* because x86 is little-endian, it is a *left
12009/// rotate* of the vector lanes.
12011 SDValue V2, ArrayRef<int> Mask,
12012 const APInt &Zeroable,
12013 const X86Subtarget &Subtarget,
12014 SelectionDAG &DAG) {
12015 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12016 "Only 32-bit and 64-bit elements are supported!");
12017
12018 // 128/256-bit vectors are only supported with VLX.
12019 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12020 && "VLX required for 128/256-bit vectors");
12021
12022 SDValue Lo = V1, Hi = V2;
12023 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12024 if (0 < Rotation)
12025 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12026 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12027
12028 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12029 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12030 // TODO: We can probably make this more aggressive and use shift-pairs like
12031 // lowerShuffleAsByteShiftMask.
12032 unsigned NumElts = Mask.size();
12033 unsigned ZeroLo = Zeroable.countr_one();
12034 unsigned ZeroHi = Zeroable.countl_one();
12035 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12036 if (!ZeroLo && !ZeroHi)
12037 return SDValue();
12038
12039 if (ZeroLo) {
12040 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12041 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12042 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12043 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12044 getZeroVector(VT, Subtarget, DAG, DL),
12045 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12046 }
12047
12048 if (ZeroHi) {
12049 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12050 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12051 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12052 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12053 getZeroVector(VT, Subtarget, DAG, DL), Src,
12054 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12055 }
12056
12057 return SDValue();
12058}
12059
12060/// Try to lower a vector shuffle as a byte shift sequence.
12062 SDValue V2, ArrayRef<int> Mask,
12063 const APInt &Zeroable,
12064 const X86Subtarget &Subtarget,
12065 SelectionDAG &DAG) {
12066 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12067 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12068
12069 // We need a shuffle that has zeros at one/both ends and a sequential
12070 // shuffle from one source within.
12071 unsigned ZeroLo = Zeroable.countr_one();
12072 unsigned ZeroHi = Zeroable.countl_one();
12073 if (!ZeroLo && !ZeroHi)
12074 return SDValue();
12075
12076 unsigned NumElts = Mask.size();
12077 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12078 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12079 return SDValue();
12080
12081 unsigned Scale = VT.getScalarSizeInBits() / 8;
12082 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12083 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12084 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12085 return SDValue();
12086
12087 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12088 Res = DAG.getBitcast(MVT::v16i8, Res);
12089
12090 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12091 // inner sequential set of elements, possibly offset:
12092 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12093 // 01234567 --> 4567zzzz --> zzzzz456
12094 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12095 if (ZeroLo == 0) {
12096 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12097 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12098 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12099 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12100 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12101 } else if (ZeroHi == 0) {
12102 unsigned Shift = Mask[ZeroLo] % NumElts;
12103 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12104 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12105 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12106 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12107 } else if (!Subtarget.hasSSSE3()) {
12108 // If we don't have PSHUFB then its worth avoiding an AND constant mask
12109 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12110 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12111 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12112 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12113 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12114 Shift += Mask[ZeroLo] % NumElts;
12115 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12116 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12117 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12118 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12119 } else
12120 return SDValue();
12121
12122 return DAG.getBitcast(VT, Res);
12123}
12124
12125/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12126///
12127/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12128/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12129/// matches elements from one of the input vectors shuffled to the left or
12130/// right with zeroable elements 'shifted in'. It handles both the strictly
12131/// bit-wise element shifts and the byte shift across an entire 128-bit double
12132/// quad word lane.
12133///
12134/// PSHL : (little-endian) left bit shift.
12135/// [ zz, 0, zz, 2 ]
12136/// [ -1, 4, zz, -1 ]
12137/// PSRL : (little-endian) right bit shift.
12138/// [ 1, zz, 3, zz]
12139/// [ -1, -1, 7, zz]
12140/// PSLLDQ : (little-endian) left byte shift
12141/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12142/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12143/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12144/// PSRLDQ : (little-endian) right byte shift
12145/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12146/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12147/// [ 1, 2, -1, -1, -1, -1, zz, zz]
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
                               int MaskOffset, const APInt &Zeroable,
                               const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  // Returns true if, when shifting by 'Shift' sub-elements within groups of
  // 'Scale' elements, every position that would be shifted in is zeroable.
  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  // Returns the shift amount (bits, or bytes for >64-bit groups) if the mask
  // is a shift by 'Shift' sub-elements within 'Scale'-element groups, else -1.
  // On success also sets the Opcode and ShiftVT output parameters.
  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    // Groups wider than 64 bits need the byte-granular VSHLDQ/VSRLDQ forms.
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    // Byte shifts express the amount in bytes rather than bits.
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
12209
                                   SDValue V2, ArrayRef<int> Mask,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG, bool BitwiseOnly) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                     Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                   Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  // Neither input matched as a shifted run with zeros shifted in.
  if (ShiftAmt < 0)
    return SDValue();

  // The caller may only accept strictly per-element bit shifts, not the
  // whole-lane byte shifts VSHLDQ/VSRLDQ.
  if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  // Round trip through the shift-friendly type: bitcast, shift by an
  // immediate, and bitcast back to the original type.
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
12246
12247// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12248// Remainder of lower half result is zero and upper half is all undef.
12249static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12250 ArrayRef<int> Mask, uint64_t &BitLen,
12251 uint64_t &BitIdx, const APInt &Zeroable) {
12252 int Size = Mask.size();
12253 int HalfSize = Size / 2;
12254 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12255 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12256
12257 // Upper half must be undefined.
12258 if (!isUndefUpperHalf(Mask))
12259 return false;
12260
12261 // Determine the extraction length from the part of the
12262 // lower half that isn't zeroable.
12263 int Len = HalfSize;
12264 for (; Len > 0; --Len)
12265 if (!Zeroable[Len - 1])
12266 break;
12267 assert(Len > 0 && "Zeroable shuffle mask");
12268
12269 // Attempt to match first Len sequential elements from the lower half.
12270 SDValue Src;
12271 int Idx = -1;
12272 for (int i = 0; i != Len; ++i) {
12273 int M = Mask[i];
12274 if (M == SM_SentinelUndef)
12275 continue;
12276 SDValue &V = (M < Size ? V1 : V2);
12277 M = M % Size;
12278
12279 // The extracted elements must start at a valid index and all mask
12280 // elements must be in the lower half.
12281 if (i > M || M >= HalfSize)
12282 return false;
12283
12284 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12285 Src = V;
12286 Idx = M - i;
12287 continue;
12288 }
12289 return false;
12290 }
12291
12292 if (!Src || Idx < 0)
12293 return false;
12294
12295 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12296 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12297 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12298 V1 = Src;
12299 return true;
12300}
12301
12302// INSERTQ: Extract lowest Len elements from lower half of second source and
12303// insert over first source, starting at Idx.
12304// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                  ArrayRef<int> Mask, uint64_t &BitLen,
                                  uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Try every possible insertion position Idx within the lower half.
  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    // The prefix [0, Idx) must be undef or in-place elements of one input.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      // The run [Idx, Hi) must be the lowest Len elements of one input.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      // They must be undef or in-place elements of the same base input that
      // matched the prefix (if any).
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      // Report the 6-bit length/index fields in bits, plus the base (V1) and
      // inserted (V2) operands for the INSERTQI node.
      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}
12369
12370/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable, SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  // EXTRQI: extract BitLen bits starting at bit BitIdx from the matched
  // source (which the matcher stores back into V1).
  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));

  // INSERTQI: insert the low BitLen bits of V2 into V1 at bit offset BitIdx.
  if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
12388
12389/// Lower a vector shuffle as an any/signed/zero extension.
12390///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce an any/signed/zero extension using the available features
/// of the subtarget. The extended elements are consecutive and can begin at
/// an offsetted element index in the input; to avoid excess shuffling the
/// offset must either be in the bottom lane or at the start of a higher
/// lane. All extended elements must be from the same lane.
                                               int Scale, int Offset,
                                               unsigned ExtOpc, SDValue InputV,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  // Lanes are 128 bits wide on x86.
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      // Indices outside the offset's lane stay undef.
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid a/zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = DAG.getBitcast(VT, InputV);
    InputV = ShuffleOffset(InputV);
    InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
  InputV = DAG.getBitcast(VT, InputV);
  bool AnyExt = ExtOpc == ISD::ANY_EXTEND;

  // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
  if (ExtOpc == ISD::SIGN_EXTEND)
    return SDValue();

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

    // If the high result half is unused (or unreachable from this lane), the
    // single extract suffices.
    if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      // 0x80 in a PSHUFB mask element produces a zero byte; any-extends can
      // leave the filler bytes undef instead.
      if ((i % Scale == 0 && SafeOffset(Idx))) {
        PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
        continue;
      }
      PSHUFBMask[i] =
          AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  // Each unpack interleaves the input with zeros (or undef for any-extends),
  // doubling the element width; repeat until the requested scale is reached.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
12557
12558/// Try to lower a vector shuffle as a zero extension on any microarch.
12559///
12560/// This routine will try to do everything in its power to cleverly lower
12561/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12562/// check for the profitability of this lowering, it tries to aggressively
12563/// match this pattern. It will use all of the micro-architectural details it
12564/// can to emit an efficient lowering. It handles both blends with all-zero
12565/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12566/// masking out later).
12567///
12568/// The reason we have dedicated lowering for zext-style shuffles is that they
12569/// are both incredibly common and often quite performance sensitive.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  // valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    // Positions i % Scale == 0 are the base elements being extended; every
    // other defined position must be zeroable.
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
    return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
                                           InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
12686
12687/// Try to get a scalar value for a specific element of a vector.
12688///
12689/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  // BUILD_VECTOR exposes all of its scalar operands; SCALAR_TO_VECTOR only
  // defines element 0.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
12713
12714/// Helper to test for a load that can be folded with x86 shuffles.
12715///
12716/// This is particularly important because the set of instructions varies
12717/// significantly based on whether the operand is a load or not.
12719 return V.hasOneUse() &&
12721}
12722
12723template<typename T>
12724static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12725 T EltVT = VT.getScalarType();
12726 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12727 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12728}
12729
12730/// Try to lower insertion of a single element into a zero vector.
12731///
12732/// This is a common pattern that we have especially efficient patterns to lower
12733/// across all subtarget feature sets.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltBits = VT.getScalarSizeInBits();

  // Unsupported soft half-float element types bail out immediately.
  if (isSoftF16(EltVT, Subtarget))
    return SDValue();

  // Find the first mask element that selects from V2 - the inserted element.
  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
  // V1 is "zeroable" if every result element other than the inserted one is
  // known zero.
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Bail if a non-zero V1 isn't used in place.
  if (!IsV1Zeroable) {
    SmallVector<int, 8> V1Mask(Mask);
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
  }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions. But we can use a masked constant vector if we're
      // inserting V2 into the bottom of V1.
      if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);

      // If we're inserting into a constant, mask off the inserted index
      // and OR with the zero-extended scalar.
      if (!IsV1Zeroable) {
        SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
        Bits[V2Index] = APInt::getZero(EltBits);
        SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
        V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
        V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
        V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
        return DAG.getNode(ISD::OR, DL, VT, V1, V2);
      }
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    if (!VT.is128BitVector())
      return SDValue();

    // Otherwise, use MOVSD, MOVSS or MOVSH.
    unsigned MovOpc = 0;
    if (EltVT == MVT::f16)
      MovOpc = X86ISD::MOVSH;
    else if (EltVT == MVT::f32)
      MovOpc = X86ISD::MOVSS;
    else if (EltVT == MVT::f64)
      MovOpc = X86ISD::MOVSD;
    else
      llvm_unreachable("Unsupported floating point element type to handle!");
    return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  // Move the scalar into element 0 with the upper elements zeroed.
  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || NumElts <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
12855
12856/// Try to lower broadcast of a single - truncated - integer element,
12857/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12858///
12859/// This assumes we have AVX2.
                                            int BroadcastIdx,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  MVT EltVT = VT.getVectorElementType();
  MVT V0VT = V0.getSimpleValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  MVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  // Map the broadcast index into the wider source element containing it.
  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  // SCALAR_TO_VECTOR only defines operand 0; BUILD_VECTOR exposes all of
  // its operands.
  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));

  // Truncate the scalar to the destination element type and broadcast it.
  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
12908
12909/// Test whether this can be lowered with a single SHUFPS instruction.
12910///
12911/// This is used to disable more specialized lowerings when the shufps lowering
12912/// will happen to be efficient.
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  // SHUFPS sources its two low result elements from one operand and its two
  // high result elements from the other, so a half mixing both inputs
  // (elements < 4 come from the first input, >= 4 from the second) cannot be
  // expressed by one instruction.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
12930
12931/// Test whether the specified input (0 or 1) is in-place blended by the
12932/// given mask.
12933///
12934/// This returns true if the elements from a particular input are already in the
12935/// slot required by the given mask and require no permutation.
12936static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12937 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12938 int Size = Mask.size();
12939 for (int i = 0; i < Size; ++i)
12940 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12941 return false;
12942
12943 return true;
12944}
12945
12946/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12947/// the given mask.
12948///
                                            int BroadcastableElement = 0) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  // Every defined mask element taken from the chosen input must reference
  // the single broadcastable element; elements from the other input and
  // undef elements are ignored.
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
      return false;
  return true;
}
12959
12960/// If we are extracting two 128-bit halves of a vector and shuffling the
12961/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12962/// multi-shuffle lowering.
                                             SDValue N1, ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  MVT VT = N0.getSimpleValueType();
  assert((VT.is128BitVector() &&
          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

  // Check that both sources are extracts of the same source vector.
  // NOTE(review): an ISD::EXTRACT_SUBVECTOR opcode check on N1 (mirroring
  // the N0 check) appears to be missing here — verify against upstream.
  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N0.getOperand(0) != N1.getOperand(0) ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue WideVec = N0.getOperand(0);
  MVT WideVT = WideVec.getSimpleValueType();
  if (!WideVT.is256BitVector())
    return SDValue();

  // Match extracts of each half of the wide source vector. Commute the shuffle
  // if the extract of the low half is N1.
  // NOTE(review): the then-statement commuting NewMask appears to have been
  // lost from this 'if' — as written it has no body; restore the mask
  // commute before relying on this path.
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<int, 4> NewMask(Mask);
  const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
  const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
  if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
    return SDValue();

  // Final bailout: if the mask is simple, we are better off using an extract
  // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
  // because that avoids a constant load from memory.
  if (NumElts == 4 &&
      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
    return SDValue();

  // Extend the shuffle mask with undef elements.
  NewMask.append(NumElts, -1);

  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
                                      NewMask);
  // This is free: ymm -> xmm.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
                     DAG.getVectorIdxConstant(0, DL));
}
13011
13012/// Try to lower broadcast of a single element.
13013///
13014/// For convenience, this code also bundles all of the subtarget feature set
13015/// filtering. While a little annoying to re-dispatch on type here, there isn't
13016/// a convenient way to factor it out.
// NOTE(review): the opening signature line is not visible in this view; the
// parameters referenced below are DL, VT, V1, V2, Mask, Subtarget and DAG.
// Returns an empty SDValue when no broadcast lowering applies.
13018 SDValue V2, ArrayRef<int> Mask,
13019 const X86Subtarget &Subtarget,
13020 SelectionDAG &DAG) {
13021 MVT EltVT = VT.getVectorElementType();
 // Bail unless the subtarget has a broadcast-capable instruction for this
 // type: MOVDDUP (SSE3, v2f64), VBROADCASTSS/SD (AVX, f32/f64), or the
 // integer/f16 VPBROADCAST forms (AVX2).
13022 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13023 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13024 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13025 return SDValue();
13026
13027 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13028 // we can only broadcast from a register with AVX2.
13029 unsigned NumEltBits = VT.getScalarSizeInBits();
 // NOTE(review): the two ternary arms selecting between X86ISD::MOVDDUP and
 // X86ISD::VBROADCAST are not visible in this view -- confirm upstream.
13030 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13033 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13034
13035 // Check that the mask is a broadcast.
13036 int BroadcastIdx = getSplatIndex(Mask);
13037 if (BroadcastIdx < 0) {
13038 // Check for hidden broadcast.
13039 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13040 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13041 return SDValue();
13042 BroadcastIdx = 0;
13043 }
13044 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13045 "a sorted mask where the broadcast "
13046 "comes from V1.");
13047 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13048
13049 // Go up the chain of (vector) values to find a scalar load that we can
13050 // combine with the broadcast.
13051 // TODO: Combine this logic with findEltLoadSrc() used by
13052 // EltsFromConsecutiveLoads().
 // BitOffset tracks the broadcast lane as a bit position so it survives
 // bitcasts and (sub)vector re-associations while walking up the chain.
13053 int BitOffset = BroadcastIdx * NumEltBits;
13054 SDValue V = V1;
13055 for (;;) {
13056 switch (V.getOpcode()) {
13057 case ISD::BITCAST: {
13058 V = V.getOperand(0);
13059 continue;
13060 }
13061 case ISD::CONCAT_VECTORS: {
 // Step into the concat operand that holds the broadcast lane.
13062 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13063 int OpIdx = BitOffset / OpBitWidth;
13064 V = V.getOperand(OpIdx);
13065 BitOffset %= OpBitWidth;
13066 continue;
13067 }
 // NOTE(review): the case label for this block is not visible in this view;
 // from the body it handles ISD::EXTRACT_SUBVECTOR -- confirm upstream.
13069 // The extraction index adds to the existing offset.
13070 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13071 unsigned Idx = V.getConstantOperandVal(1);
13072 unsigned BeginOffset = Idx * EltBitWidth;
13073 BitOffset += BeginOffset;
13074 V = V.getOperand(0);
13075 continue;
13076 }
13077 case ISD::INSERT_SUBVECTOR: {
13078 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13079 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13080 int Idx = (int)V.getConstantOperandVal(2);
13081 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13082 int BeginOffset = Idx * EltBitWidth;
13083 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
 // Descend into the inserted subvector only if the broadcast lane lies
 // inside the inserted range; otherwise the outer vector supplies it.
13084 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13085 BitOffset -= BeginOffset;
13086 V = VInner;
13087 } else {
13088 V = VOuter;
13089 }
13090 continue;
13091 }
13092 }
13093 break;
13094 }
13095 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13096 BroadcastIdx = BitOffset / NumEltBits;
13097
13098 // Do we need to bitcast the source to retrieve the original broadcast index?
13099 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13100
13101 // Check if this is a broadcast of a scalar. We special case lowering
13102 // for scalars so that we can more effectively fold with loads.
13103 // If the original value has a larger element type than the shuffle, the
13104 // broadcast element is in essence truncated. Make that explicit to ease
13105 // folding.
13106 if (BitCastSrc && VT.isInteger())
13107 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13108 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13109 return TruncBroadcast;
13110
13111 // Also check the simpler case, where we can directly reuse the scalar.
13112 if (!BitCastSrc &&
13113 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13114 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13115 V = V.getOperand(BroadcastIdx);
13116
13117 // If we can't broadcast from a register, check that the input is a load.
13118 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13119 return SDValue();
13120 } else if (ISD::isNormalLoad(V.getNode()) &&
13121 cast<LoadSDNode>(V)->isSimple()) {
13122 // We do not check for one-use of the vector load because a broadcast load
13123 // is expected to be a win for code size, register pressure, and possibly
13124 // uops even if the original vector load is not eliminated.
13125
13126 // Reduce the vector load and shuffle to a broadcasted scalar load.
13127 auto *Ld = cast<LoadSDNode>(V);
13128 SDValue BaseAddr = Ld->getBasePtr();
13129 MVT SVT = VT.getScalarType();
13130 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13131 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
 // NOTE(review): the continuation computing NewAddr (base + Offset) is not
 // visible in this view -- confirm upstream.
13132 SDValue NewAddr =
13134
13135 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13136 // than MOVDDUP.
13137 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13138 if (Opcode == X86ISD::VBROADCAST) {
13139 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13140 SDValue Ops[] = {Ld->getChain(), NewAddr};
13141 V = DAG.getMemIntrinsicNode(
13142 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13144 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13146 return DAG.getBitcast(VT, V);
13147 }
 // MOVDDUP path: reload just the broadcast f64 element as a scalar.
13148 assert(SVT == MVT::f64 && "Unexpected VT!");
13149 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13151 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13153 } else if (!BroadcastFromReg) {
13154 // We can't broadcast from a vector register.
13155 return SDValue();
13156 } else if (BitOffset != 0) {
13157 // We can only broadcast from the zero-element of a vector register,
13158 // but it can be advantageous to broadcast from the zero-element of a
13159 // subvector.
13160 if (!VT.is256BitVector() && !VT.is512BitVector())
13161 return SDValue();
13162
13163 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13164 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13165 return SDValue();
13166
13167 // If we are broadcasting an element from the lowest 128-bit subvector, try
13168 // to move the element in position.
13169 if (BitOffset < 128 && NumActiveElts > 1 &&
13170 V.getScalarValueSizeInBits() == NumEltBits) {
13171 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13172 "Unexpected bit-offset");
13173 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13174 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13175 V = extractSubVector(V, 0, DAG, DL, 128);
13176 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13177 } else {
13178 // Only broadcast the zero-element of a 128-bit subvector.
13179 if ((BitOffset % 128) != 0)
13180 return SDValue();
13181
13182 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13183 "Unexpected bit-offset");
13184 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13185 "Unexpected vector size");
13186 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13187 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13188 }
13189 }
13190
13191 // On AVX we can use VBROADCAST directly for scalar sources.
13192 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13193 V = DAG.getBitcast(MVT::f64, V);
13194 if (Subtarget.hasAVX()) {
13195 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13196 return DAG.getBitcast(VT, V);
13197 }
 // Pre-AVX: materialize the scalar into a vector so MOVDDUP can splat it.
13198 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13199 }
13200
13201 // If this is a scalar, do the broadcast on this type and bitcast.
13202 if (!V.getValueType().isVector()) {
13203 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13204 "Unexpected scalar size");
 // NOTE(review): the continuation supplying the element count to
 // getVectorVT is not visible in this view -- confirm upstream.
13205 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13207 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13208 }
13209
13210 // We only support broadcasting from 128-bit vectors to minimize the
13211 // number of patterns we need to deal with in isel. So extract down to
13212 // 128-bits, removing as many bitcasts as possible.
 // NOTE(review): the then-statement extracting the low 128 bits is not
 // visible in this view -- confirm upstream.
13213 if (V.getValueSizeInBits() > 128)
13215
13216 // Otherwise cast V to a vector with the same element type as VT, but
13217 // possibly narrower than VT. Then perform the broadcast.
13218 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13219 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13220 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13221}
13222
13223// Check for whether we can use INSERTPS to perform the shuffle. We only use
13224// INSERTPS when the V1 elements are already in the correct locations
13225// because otherwise we can just always use two SHUFPS instructions which
13226// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13227// perform INSERTPS if a single V1 element is out of place and all V2
13228// elements are zeroable.
// NOTE(review): the opening signature line is not visible in this view; the
// parameters referenced below are V1, V2 (by reference, updated on success),
// InsertPSMask (out), Zeroable, Mask and DAG. Returns true on a match.
13230 unsigned &InsertPSMask,
13231 const APInt &Zeroable,
13232 ArrayRef<int> Mask, SelectionDAG &DAG) {
13233 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13234 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13235 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13236
13237 // Attempt to match INSERTPS with one element from VA or VB being
13238 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13239 // are updated.
13240 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13241 ArrayRef<int> CandidateMask) {
13242 unsigned ZMask = 0;
13243 int VADstIndex = -1;
13244 int VBDstIndex = -1;
13245 bool VAUsedInPlace = false;
13246
13247 for (int i = 0; i < 4; ++i) {
13248 // Synthesize a zero mask from the zeroable elements (includes undefs).
13249 if (Zeroable[i]) {
13250 ZMask |= 1 << i;
13251 continue;
13252 }
13253
13254 // Flag if we use any VA inputs in place.
13255 if (i == CandidateMask[i]) {
13256 VAUsedInPlace = true;
13257 continue;
13258 }
13259
13260 // We can only insert a single non-zeroable element.
13261 if (VADstIndex >= 0 || VBDstIndex >= 0)
13262 return false;
13263
 // Mask values < 4 select from VA, >= 4 select from VB.
13264 if (CandidateMask[i] < 4) {
13265 // VA input out of place for insertion.
13266 VADstIndex = i;
13267 } else {
13268 // VB input for insertion.
13269 VBDstIndex = i;
13270 }
13271 }
13272
13273 // Don't bother if we have no (non-zeroable) element for insertion.
13274 if (VADstIndex < 0 && VBDstIndex < 0)
13275 return false;
13276
13277 // Determine element insertion src/dst indices. The src index is from the
13278 // start of the inserted vector, not the start of the concatenated vector.
13279 unsigned VBSrcIndex = 0;
13280 if (VADstIndex >= 0) {
13281 // If we have a VA input out of place, we use VA as the V2 element
13282 // insertion and don't use the original V2 at all.
13283 VBSrcIndex = CandidateMask[VADstIndex];
13284 VBDstIndex = VADstIndex;
13285 VB = VA;
13286 } else {
13287 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13288 }
13289
13290 // If no V1 inputs are used in place, then the result is created only from
13291 // the zero mask and the V2 insertion - so remove V1 dependency.
13292 if (!VAUsedInPlace)
13293 VA = DAG.getUNDEF(MVT::v4f32);
13294
13295 // Update V1, V2 and InsertPSMask accordingly.
13296 V1 = VA;
13297 V2 = VB;
13298
13299 // Insert the V2 element into the desired position.
 // INSERTPS immediate layout: bits [7:6] = source lane, [5:4] = destination
 // lane, [3:0] = zero mask.
13300 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13301 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13302 return true;
13303 };
13304
13305 if (matchAsInsertPS(V1, V2, Mask))
13306 return true;
13307
13308 // Commute and try again.
13309 SmallVector<int, 4> CommutedMask(Mask);
 // NOTE(review): the statement commuting CommutedMask (swapping which input
 // each element selects) is not visible in this view -- confirm upstream.
13311 if (matchAsInsertPS(V2, V1, CommutedMask))
13312 return true;
13313
13314 return false;
13315}
13316
/// Lower a v4f32 shuffle to a single INSERTPS instruction if possible.
// NOTE(review): the opening signature line is not visible in this view; the
// parameters referenced below are DL, V1, V2, Mask, Zeroable and DAG.
13318 ArrayRef<int> Mask, const APInt &Zeroable,
13319 SelectionDAG &DAG) {
13320 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13321 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13322
13323 // Attempt to match the insertps pattern.
 // matchShuffleAsInsertPS may rewrite V1/V2 and fills in the 8-bit immediate.
13324 unsigned InsertPSMask = 0;
13325 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13326 return SDValue();
13327
13328 // Insert the V2 element into the desired position.
13329 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13330 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13331}
13332
13333/// Handle lowering of 2-lane 64-bit floating point shuffles.
13334///
13335/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13336/// support for floating point shuffles but not integer shuffles. These
13337/// instructions will incur a domain crossing penalty on some chips though so
13338/// it is better to avoid lowering through this for integer vectors where
13339/// possible.
// NOTE(review): the opening signature line is not visible in this view; the
// parameters referenced below are DL, Mask, Zeroable, V1, V2, Subtarget, DAG.
13341 const APInt &Zeroable, SDValue V1, SDValue V2,
13342 const X86Subtarget &Subtarget,
13343 SelectionDAG &DAG) {
13344 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13345 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13346 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13347
13348 if (V2.isUndef()) {
13349 // Check for being able to broadcast a single element.
13350 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13351 Mask, Subtarget, DAG))
13352 return Broadcast;
13353
13354 // Straight shuffle of a single input vector. Simulate this by using the
13355 // single input as both of the "inputs" to this instruction..
 // Immediate bit i selects the high (1) or low (0) source lane for result
 // lane i.
13356 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13357
13358 if (Subtarget.hasAVX()) {
13359 // If we have AVX, we can use VPERMILPS which will allow folding a load
13360 // into the shuffle.
13361 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13362 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13363 }
13364
13365 return DAG.getNode(
13366 X86ISD::SHUFP, DL, MVT::v2f64,
13367 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13368 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13369 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13370 }
13371 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13372 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13373 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13374 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13375
13376 if (Subtarget.hasAVX2())
13377 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13378 return Extract;
13379
13380 // When loading a scalar and then shuffling it into a vector we can often do
13381 // the insertion cheaply.
 // NOTE(review): the call head for this element-insertion attempt is not
 // visible in this view -- confirm upstream.
13383 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13384 return Insertion;
13385 // Try inverting the insertion since for v2 masks it is easy to do and we
13386 // can't reliably sort the mask one way or the other.
 // XOR with 2 flips which input (V1 vs V2) each mask element refers to.
13387 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13388 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13390 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13391 return Insertion;
13392
13393 // Try to use one of the special instruction patterns to handle two common
13394 // blend patterns if a zero-blend above didn't work.
13395 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13396 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13397 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13398 // We can either use a special instruction to load over the low double or
13399 // to move just the low double.
13400 return DAG.getNode(
13401 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13402 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13403
13404 if (Subtarget.hasSSE41())
13405 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13406 Zeroable, Subtarget, DAG))
13407 return Blend;
13408
13409 // Use dedicated unpack instructions for masks that match their pattern.
13410 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13411 return V;
13412
 // Fallback: a two-input SHUFPD; lane 0 from V1, lane 1 from V2 (Mask[1]-2
 // rebases the V2 element index into that input's own lane numbering).
13413 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13414 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13415 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13416}
13417
13418/// Handle lowering of 2-lane 64-bit integer shuffles.
13419///
13420/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13421/// the integer unit to minimize domain crossing penalties. However, for blends
13422/// it falls back to the floating point shuffle operation with appropriate bit
13423/// casting.
// NOTE(review): the opening signature line is not visible in this view; the
// parameters referenced below are DL, Mask, Zeroable, V1, V2, Subtarget, DAG.
13425 const APInt &Zeroable, SDValue V1, SDValue V2,
13426 const X86Subtarget &Subtarget,
13427 SelectionDAG &DAG) {
13428 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13429 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13430 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13431
13432 if (V2.isUndef()) {
13433 // Check for being able to broadcast a single element.
13434 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13435 Mask, Subtarget, DAG))
13436 return Broadcast;
13437
13438 // Straight shuffle of a single input vector. For everything from SSE2
13439 // onward this has a single fast instruction with no scary immediates.
13440 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13441 V1 = DAG.getBitcast(MVT::v4i32, V1);
 // Each v2i64 lane expands to a pair of consecutive v4i32 lanes (2k, 2k+1).
13442 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13443 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13444 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13445 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13446 return DAG.getBitcast(
13447 MVT::v2i64,
13448 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13449 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13450 }
13451 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13452 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13453 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13454 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13455
13456 if (Subtarget.hasAVX2())
13457 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13458 return Extract;
13459
13460 // Try to use shift instructions.
13461 if (SDValue Shift =
13462 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13463 DAG, /*BitwiseOnly*/ false))
13464 return Shift;
13465
13466 // When loading a scalar and then shuffling it into a vector we can often do
13467 // the insertion cheaply.
 // NOTE(review): the call head for this element-insertion attempt is not
 // visible in this view -- confirm upstream.
13469 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13470 return Insertion;
13471 // Try inverting the insertion since for v2 masks it is easy to do and we
13472 // can't reliably sort the mask one way or the other.
 // XOR with 2 flips which input (V1 vs V2) each mask element refers to.
13473 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13475 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13476 return Insertion;
13477
13478 // We have different paths for blend lowering, but they all must use the
13479 // *exact* same predicate.
13480 bool IsBlendSupported = Subtarget.hasSSE41();
13481 if (IsBlendSupported)
13482 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13483 Zeroable, Subtarget, DAG))
13484 return Blend;
13485
13486 // Use dedicated unpack instructions for masks that match their pattern.
13487 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13488 return V;
13489
13490 // Try to use byte rotation instructions.
13491 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
13492 if (Subtarget.hasSSSE3()) {
13493 if (Subtarget.hasVLX())
13494 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13495 Zeroable, Subtarget, DAG))
13496 return Rotate;
13497
13498 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13499 Subtarget, DAG))
13500 return Rotate;
13501 }
13502
13503 // If we have direct support for blends, we should lower by decomposing into
13504 // a permute. That will be faster than the domain cross.
13505 if (IsBlendSupported)
13506 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13507 Zeroable, Subtarget, DAG);
13508
13509 // We implement this with SHUFPD which is pretty lame because it will likely
13510 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13511 // However, all the alternatives are still more cycles and newer chips don't
13512 // have this problem. It would be really nice if x86 had better shuffles here.
13513 V1 = DAG.getBitcast(MVT::v2f64, V1);
13514 V2 = DAG.getBitcast(MVT::v2f64, V2);
13515 return DAG.getBitcast(MVT::v2i64,
13516 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13517}
13518
13519/// Lower a vector shuffle using the SHUFPS instruction.
13520///
13521/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13522/// It makes no assumptions about whether this is the *best* lowering, it simply
13523/// uses it.
// NOTE(review): the opening signature line is not visible in this view; the
// parameters referenced below are DL, VT, Mask, V1, V2 and DAG. SHUFPS takes
// its low two result lanes from the first operand, the high two from the
// second, so the code below arranges LowV/HighV and a 4-entry NewMask to fit.
13525 ArrayRef<int> Mask, SDValue V1,
13526 SDValue V2, SelectionDAG &DAG) {
13527 SDValue LowV = V1, HighV = V2;
13528 SmallVector<int, 4> NewMask(Mask);
13529 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13530
13531 if (NumV2Elements == 1) {
13532 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13533
13534 // Compute the index adjacent to V2Index and in the same half by toggling
13535 // the low bit.
13536 int V2AdjIndex = V2Index ^ 1;
13537
13538 if (Mask[V2AdjIndex] < 0) {
13539 // Handles all the cases where we have a single V2 element and an undef.
13540 // This will only ever happen in the high lanes because we commute the
13541 // vector otherwise.
13542 if (V2Index < 2)
13543 std::swap(LowV, HighV);
13544 NewMask[V2Index] -= 4;
13545 } else {
13546 // Handle the case where the V2 element ends up adjacent to a V1 element.
13547 // To make this work, blend them together as the first step.
13548 int V1Index = V2AdjIndex;
13549 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13550 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13551 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13552
13553 // Now proceed to reconstruct the final blend as we have the necessary
13554 // high or low half formed.
13555 if (V2Index < 2) {
13556 LowV = V2;
13557 HighV = V1;
13558 } else {
13559 HighV = V2;
13560 }
13561 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13562 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13563 }
13564 } else if (NumV2Elements == 2) {
13565 if (Mask[0] < 4 && Mask[1] < 4) {
13566 // Handle the easy case where we have V1 in the low lanes and V2 in the
13567 // high lanes.
13568 NewMask[2] -= 4;
13569 NewMask[3] -= 4;
13570 } else if (Mask[2] < 4 && Mask[3] < 4) {
13571 // We also handle the reversed case because this utility may get called
13572 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13573 // arrange things in the right direction.
13574 NewMask[0] -= 4;
13575 NewMask[1] -= 4;
13576 HighV = V1;
13577 LowV = V2;
13578 } else {
13579 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13580 // trying to place elements directly, just blend them and set up the final
13581 // shuffle to place them.
13582
13583 // The first two blend mask elements are for V1, the second two are for
13584 // V2.
13585 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13586 Mask[2] < 4 ? Mask[2] : Mask[3],
13587 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13588 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13589 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13590 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13591
13592 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13593 // a blend.
13594 LowV = HighV = V1;
13595 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13596 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13597 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13598 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13599 }
13600 } else if (NumV2Elements == 3) {
13601 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13602 // we can get here due to other paths (e.g repeated mask matching) that we
13603 // don't want to do another round of lowerVECTOR_SHUFFLE.
 // NOTE(review): the statement commuting NewMask before the recursive call
 // is not visible in this view -- confirm upstream. The recursion swaps the
 // operands so the (now) majority input becomes V1.
13605 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13606 }
13607 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13608 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13609}
13610
13611/// Lower 4-lane 32-bit floating point shuffles.
13612///
13613/// Uses instructions exclusively from the floating point unit to minimize
13614/// domain crossing penalties, as these are sufficient to implement all v4f32
13615/// shuffles.
// NOTE(review): the opening signature line is not visible in this view; the
// parameters referenced below are DL, Mask, Zeroable, V1, V2, Subtarget, DAG.
13617 const APInt &Zeroable, SDValue V1, SDValue V2,
13618 const X86Subtarget &Subtarget,
13619 SelectionDAG &DAG) {
13620 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13621 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13622 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13623
13624 if (Subtarget.hasSSE41())
13625 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13626 Zeroable, Subtarget, DAG))
13627 return Blend;
13628
13629 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13630
 // Unary shuffle: only V1 elements are referenced.
13631 if (NumV2Elements == 0) {
13632 // Check for being able to broadcast a single element.
13633 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13634 Mask, Subtarget, DAG))
13635 return Broadcast;
13636
13637 // Use even/odd duplicate instructions for masks that match their pattern.
13638 if (Subtarget.hasSSE3()) {
13639 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13640 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13641 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13642 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13643 }
13644
13645 if (Subtarget.hasAVX()) {
13646 // If we have AVX, we can use VPERMILPS which will allow folding a load
13647 // into the shuffle.
13648 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13649 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13650 }
13651
13652 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13653 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13654 if (!Subtarget.hasSSE2()) {
13655 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13656 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13657 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13658 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13659 }
13660
13661 // Otherwise, use a straight shuffle of a single input vector. We pass the
13662 // input vector to both operands to simulate this with a SHUFPS.
13663 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13664 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13665 }
13666
 // NOTE(review): the call head for this zero/any-extend attempt is not
 // visible in this view -- confirm upstream. On success the integer-domain
 // result is bitcast back to v4f32.
13667 if (Subtarget.hasSSE2())
13669 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13670 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13671 return ZExt;
13672 }
13673
13674 if (Subtarget.hasAVX2())
13675 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13676 return Extract;
13677
13678 // There are special ways we can lower some single-element blends. However, we
13679 // have custom ways we can lower more complex single-element blends below that
13680 // we defer to if both this and BLENDPS fail to match, so restrict this to
13681 // when the V2 input is targeting element 0 of the mask -- that is the fast
13682 // case here.
13683 if (NumV2Elements == 1 && Mask[0] >= 4)
13685 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13686 return V;
13687
13688 if (Subtarget.hasSSE41()) {
13689 // Use INSERTPS if we can complete the shuffle efficiently.
13690 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13691 return V;
13692
13693 if (!isSingleSHUFPSMask(Mask))
13694 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13695 V2, Mask, DAG))
13696 return BlendPerm;
13697 }
13698
13699 // Use low/high mov instructions. These are only valid in SSE1 because
13700 // otherwise they are widened to v2f64 and never get here.
13701 if (!Subtarget.hasSSE2()) {
13702 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13703 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13704 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13705 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13706 }
13707
13708 // Use dedicated unpack instructions for masks that match their pattern.
13709 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13710 return V;
13711
13712 // Otherwise fall back to a SHUFPS lowering strategy.
13713 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13714}
13715
13716/// Lower 4-lane i32 vector shuffles.
13717///
13718/// We try to handle these with integer-domain shuffles where we can, but for
13719/// blends we use the floating point domain blend instructions.
13721 const APInt &Zeroable, SDValue V1, SDValue V2,
13722 const X86Subtarget &Subtarget,
13723 SelectionDAG &DAG) {
13724 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13725 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13726 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13727
13728 // Whenever we can lower this as a zext, that instruction is strictly faster
13729 // than any alternative. It also allows us to fold memory operands into the
13730 // shuffle in many cases.
13731 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13732 Zeroable, Subtarget, DAG))
13733 return ZExt;
13734
13735 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13736
13737 // Try to use shift instructions if fast.
13738 if (Subtarget.preferLowerShuffleAsShift()) {
13739 if (SDValue Shift =
13740 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13741 Subtarget, DAG, /*BitwiseOnly*/ true))
13742 return Shift;
13743 if (NumV2Elements == 0)
13744 if (SDValue Rotate =
13745 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13746 return Rotate;
13747 }
13748
13749 if (NumV2Elements == 0) {
13750 // Try to use broadcast unless the mask only has one non-undef element.
13751 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13752 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13753 Mask, Subtarget, DAG))
13754 return Broadcast;
13755 }
13756
13757 // Straight shuffle of a single input vector. For everything from SSE2
13758 // onward this has a single fast instruction with no scary immediates.
13759 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13760 // but we aren't actually going to use the UNPCK instruction because doing
13761 // so prevents folding a load into this instruction or making a copy.
13762 const int UnpackLoMask[] = {0, 0, 1, 1};
13763 const int UnpackHiMask[] = {2, 2, 3, 3};
13764 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13765 Mask = UnpackLoMask;
13766 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13767 Mask = UnpackHiMask;
13768
13769 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13770 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13771 }
13772
13773 if (Subtarget.hasAVX2())
13774 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13775 return Extract;
13776
13777 // Try to use shift instructions.
13778 if (SDValue Shift =
13779 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13780 DAG, /*BitwiseOnly*/ false))
13781 return Shift;
13782
13783 // There are special ways we can lower some single-element blends.
13784 if (NumV2Elements == 1)
13786 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13787 return V;
13788
13789 // We have different paths for blend lowering, but they all must use the
13790 // *exact* same predicate.
13791 bool IsBlendSupported = Subtarget.hasSSE41();
13792 if (IsBlendSupported)
13793 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13794 Zeroable, Subtarget, DAG))
13795 return Blend;
13796
13797 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13798 Zeroable, Subtarget, DAG))
13799 return Masked;
13800
13801 // Use dedicated unpack instructions for masks that match their pattern.
13802 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13803 return V;
13804
13805 // Try to use byte rotation instructions.
13806 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
13807 if (Subtarget.hasSSSE3()) {
13808 if (Subtarget.hasVLX())
13809 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13810 Zeroable, Subtarget, DAG))
13811 return Rotate;
13812
13813 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13814 Subtarget, DAG))
13815 return Rotate;
13816 }
13817
13818 // Assume that a single SHUFPS is faster than an alternative sequence of
13819 // multiple instructions (even if the CPU has a domain penalty).
13820 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13821 if (!isSingleSHUFPSMask(Mask)) {
13822 // If we have direct support for blends, we should lower by decomposing into
13823 // a permute. That will be faster than the domain cross.
13824 if (IsBlendSupported)
13825 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13826 Zeroable, Subtarget, DAG);
13827
13828 // Try to lower by permuting the inputs into an unpack instruction.
13829 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13830 Mask, Subtarget, DAG))
13831 return Unpack;
13832 }
13833
13834 // We implement this with SHUFPS because it can blend from two vectors.
13835 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13836 // up the inputs, bypassing domain shift penalties that we would incur if we
13837 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13838 // relevant.
13839 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13840 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13841 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13842 return DAG.getBitcast(MVT::v4i32, ShufPS);
13843}
13844
13845/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13846/// shuffle lowering, and the most complex part.
13847///
13848/// The lowering strategy is to try to form pairs of input lanes which are
13849/// targeted at the same half of the final vector, and then use a dword shuffle
13850/// to place them onto the right half, and finally unpack the paired lanes into
13851/// their final position.
13852///
13853/// The exact breakdown of how to form these dword pairs and align them on the
13854/// correct sides is really tricky. See the comments within the function for
13855/// more of the details.
13856///
13857/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13858/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13859/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13860/// vector, form the analogous 128-bit 8-element Mask.
// NOTE(review): this extract is missing the opening declaration line
// ("static SDValue lowerV8I16GeneralSingleInputShuffle(" — name inferred from
// the recursive call below); confirm against the upstream file.
13862    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13863    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13864  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13865  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13866
13867  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  // These slices alias Mask, so updates to LoMask/HiMask below are visible
  // through Mask as well (and vice versa).
13868  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13869  MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13870
13871  // Attempt to directly match PSHUFLW or PSHUFHW.
13872  if (isUndefOrInRange(LoMask, 0, 4) &&
13873      isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13874    return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13875                       getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13876  }
13877  if (isUndefOrInRange(HiMask, 4, 8) &&
13878      isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
  // Rebase the high-half mask elements to 0..3 before forming the PSHUFHW
  // immediate; undef (negative) entries are left untouched.
13879    for (int i = 0; i != 4; ++i)
13880      HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13881    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13882                       getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13883  }
13884
  // Collect the sorted, deduplicated source indices feeding each destination
  // half, then partition each list by source half (indices < 4 come from the
  // low half). This yields the four "quadrant" input groups used throughout:
  // L->L, H->L, L->H and H->H.
13885  SmallVector<int, 4> LoInputs;
13886  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13887  array_pod_sort(LoInputs.begin(), LoInputs.end());
13888  LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13889  SmallVector<int, 4> HiInputs;
13890  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13891  array_pod_sort(HiInputs.begin(), HiInputs.end());
13892  HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13893  int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13894  int NumHToL = LoInputs.size() - NumLToL;
13895  int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13896  int NumHToH = HiInputs.size() - NumLToH;
13897  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13898  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13899  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13900  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13901
13902  // If we are shuffling values from one half - check how many different DWORD
13903  // pairs we need to create. If only 1 or 2 then we can perform this as a
13904  // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13905  auto ShuffleDWordPairs =  [&](ArrayRef<int> PSHUFHalfMask,
13906                                ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13907    V = DAG.getNode(ShufWOp, DL, VT, V,
13908                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13909    V = DAG.getBitcast(PSHUFDVT, V);
13910    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13911                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13912    return DAG.getBitcast(VT, V);
13913  };
13914
  // All inputs land in a single destination half — try the cheap two-shuffle
  // route described above.
13915  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13916    int PSHUFDMask[4] = { -1, -1, -1, -1 };
13917    SmallVector<std::pair<int, int>, 4> DWordPairs;
13918    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13919
13920    // Collect the different DWORD pairs.
13921    for (int DWord = 0; DWord != 4; ++DWord) {
13922      int M0 = Mask[2 * DWord + 0];
13923      int M1 = Mask[2 * DWord + 1];
      // Normalize in-range elements to 0..3 (within their source half);
      // undef elements stay negative.
13924      M0 = (M0 >= 0 ? M0 % 4 : M0);
13925      M1 = (M1 >= 0 ? M1 % 4 : M1);
13926      if (M0 < 0 && M1 < 0)
13927        continue;
13928
      // Reuse an already-collected pair when it is compatible (each lane
      // matches or is undef on one side), refining its undef lanes.
13929      bool Match = false;
13930      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13931        auto &DWordPair = DWordPairs[j];
13932        if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13933            (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13934          DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13935          DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13936          PSHUFDMask[DWord] = DOffset + j;
13937          Match = true;
13938          break;
13939        }
13940      }
13941      if (!Match) {
13942        PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13943        DWordPairs.push_back(std::make_pair(M0, M1));
13944      }
13945    }
13946
13947    if (DWordPairs.size() <= 2) {
13948      DWordPairs.resize(2, std::make_pair(-1, -1));
13949      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13950                              DWordPairs[1].first, DWordPairs[1].second};
13951      // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13952      if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13953          ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13954        int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13955        std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13956        PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13957        PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13958      }
13959      if ((NumHToL + NumHToH) == 0)
13960        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13961      if ((NumLToL + NumLToH) == 0)
13962        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13963    }
13964  }
13965
13966  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13967  // such inputs we can swap two of the dwords across the half mark and end up
13968  // with <=2 inputs to each half in each half. Once there, we can fall through
13969  // to the generic code below. For example:
13970  //
13971  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13972  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13973  //
13974  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13975  // and an existing 2-into-2 on the other half. In this case we may have to
13976  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13977  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13978  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13979  // because any other situation (including a 3-into-1 or 1-into-3 in the other
13980  // half than the one we target for fixing) will be fixed when we re-enter this
13981  // path. We will also combine away any sequence of PSHUFD instructions that
13982  // result into a single instruction. Here is an example of the tricky case:
13983  //
13984  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13985  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13986  //
13987  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13988  //
13989  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13990  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13991  //
13992  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13993  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13994  //
13995  // The result is fine to be handled by the generic logic.
  // A/B name the two destination halves generically so the same lambda serves
  // both the (lo,hi) and (hi,lo) orientations; AOffset/BOffset are the word
  // offsets (0 or 4) of those halves within the 8-element mask.
13996  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13997                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13998                          int AOffset, int BOffset) {
13999    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14000           "Must call this with A having 3 or 1 inputs from the A half.");
14001    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14002           "Must call this with B having 1 or 3 inputs from the B half.");
14003    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14004           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14005
14006    bool ThreeAInputs = AToAInputs.size() == 3;
14007
14008    // Compute the index of dword with only one word among the three inputs in
14009    // a half by taking the sum of the half with three inputs and subtracting
14010    // the sum of the actual three inputs. The difference is the remaining
14011    // slot.
14012    int ADWord = 0, BDWord = 0;
14013    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14014    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14015    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14016    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14017    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14018    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14019    int TripleNonInputIdx =
14020        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14021    TripleDWord = TripleNonInputIdx / 2;
14022
14023    // We use xor with one to compute the adjacent DWord to whichever one the
14024    // OneInput is in.
14025    OneInputDWord = (OneInput / 2) ^ 1;
14026
14027    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14028    // and BToA inputs. If there is also such a problem with the BToB and AToB
14029    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14030    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14031    // is essential that we don't *create* a 3<-1 as then we might oscillate.
14032    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14033      // Compute how many inputs will be flipped by swapping these DWords. We
14034      // need
14035      // to balance this to ensure we don't form a 3-1 shuffle in the other
14036      // half.
14037      int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14038                                 llvm::count(AToBInputs, 2 * ADWord + 1);
14039      int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14040                                 llvm::count(BToBInputs, 2 * BDWord + 1);
14041      if ((NumFlippedAToBInputs == 1 &&
14042           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14043          (NumFlippedBToBInputs == 1 &&
14044           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14045        // We choose whether to fix the A half or B half based on whether that
14046        // half has zero flipped inputs. At zero, we may not be able to fix it
14047        // with that half. We also bias towards fixing the B half because that
14048        // will more commonly be the high half, and we have to bias one way.
14049        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14050                                                       ArrayRef<int> Inputs) {
14051          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14052          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14053          // Determine whether the free index is in the flipped dword or the
14054          // unflipped dword based on where the pinned index is. We use this bit
14055          // in an xor to conditionally select the adjacent dword.
14056          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14057          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14058          if (IsFixIdxInput == IsFixFreeIdxInput)
14059            FixFreeIdx += 1;
14060          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14061          assert(IsFixIdxInput != IsFixFreeIdxInput &&
14062                 "We need to be changing the number of flipped inputs!");
14063          int PSHUFHalfMask[] = {0, 1, 2, 3};
14064          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14065          V = DAG.getNode(
14066              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14067              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14068              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14069
          // Reflect the word swap we just emitted back into the mask.
14070          for (int &M : Mask)
14071            if (M >= 0 && M == FixIdx)
14072              M = FixFreeIdx;
14073            else if (M >= 0 && M == FixFreeIdx)
14074              M = FixIdx;
14075        };
14076        if (NumFlippedBToBInputs != 0) {
14077          int BPinnedIdx =
14078              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14079          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14080        } else {
14081          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14082          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14083          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14084        }
14085      }
14086    }
14087
    // Swap the ADWord and BDWord dwords across the half mark.
14088    int PSHUFDMask[] = {0, 1, 2, 3};
14089    PSHUFDMask[ADWord] = BDWord;
14090    PSHUFDMask[BDWord] = ADWord;
14091    V = DAG.getBitcast(
14092        VT,
14093        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14094                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14095
14096    // Adjust the mask to match the new locations of A and B.
14097    for (int &M : Mask)
14098      if (M >= 0 && M/2 == ADWord)
14099        M = 2 * BDWord + M % 2;
14100      else if (M >= 0 && M/2 == BDWord)
14101        M = 2 * ADWord + M % 2;
14102
14103    // Recurse back into this routine to re-compute state now that this isn't
14104    // a 3 and 1 problem.
14105    return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14106  };
14107  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14108    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14109  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14110    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14111
14112  // At this point there are at most two inputs to the low and high halves from
14113  // each half. That means the inputs can always be grouped into dwords and
14114  // those dwords can then be moved to the correct half with a dword shuffle.
14115  // We use at most one low and one high word shuffle to collect these paired
14116  // inputs into dwords, and finally a dword shuffle to place them.
14117  int PSHUFLMask[4] = {-1, -1, -1, -1};
14118  int PSHUFHMask[4] = {-1, -1, -1, -1};
14119  int PSHUFDMask[4] = {-1, -1, -1, -1};
14120
14121  // First fix the masks for all the inputs that are staying in their
14122  // original halves. This will then dictate the targets of the cross-half
14123  // shuffles.
  // InPlaceInputs: inputs already in their destination half; IncomingInputs:
  // inputs that will be moved into that half; SourceHalfMask: the in-progress
  // PSHUF(L|H)W mask for that half; HalfMask: the matching 4-element mask
  // slice; HalfOffset: 0 for the low half, 4 for the high half.
14124  auto fixInPlaceInputs =
14125      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14126                    MutableArrayRef<int> SourceHalfMask,
14127                    MutableArrayRef<int> HalfMask, int HalfOffset) {
14128    if (InPlaceInputs.empty())
14129      return;
14130    if (InPlaceInputs.size() == 1) {
14131      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14132          InPlaceInputs[0] - HalfOffset;
14133      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14134      return;
14135    }
14136    if (IncomingInputs.empty()) {
14137      // Just fix all of the in place inputs.
14138      for (int Input : InPlaceInputs) {
14139        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14140        PSHUFDMask[Input / 2] = Input / 2;
14141      }
14142      return;
14143    }
14144
14145    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14146    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14147        InPlaceInputs[0] - HalfOffset;
14148    // Put the second input next to the first so that they are packed into
14149    // a dword. We find the adjacent index by toggling the low bit.
14150    int AdjIndex = InPlaceInputs[0] ^ 1;
14151    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14152    llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14153    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14154  };
14155  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14156  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14157
14158  // Now gather the cross-half inputs and place them into a free dword of
14159  // their target half.
14160  // FIXME: This operation could almost certainly be simplified dramatically to
14161  // look more like the 3-1 fixing operation.
  // IncomingInputs: inputs crossing into the destination half;
  // ExistingInputs: inputs already native to the destination half;
  // SourceHalfMask: the PSHUF(L|H)W mask for the *source* half;
  // HalfMask/FinalSourceHalfMask: destination/source 4-element mask slices;
  // SourceOffset/DestOffset: word offsets (0 or 4) of the two halves.
14162  auto moveInputsToRightHalf = [&PSHUFDMask](
14163      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14164      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14165      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14166      int DestOffset) {
14167    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14168      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14169    };
14170    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14171                                               int Word) {
14172      int LowWord = Word & ~1;
14173      int HighWord = Word | 1;
14174      return isWordClobbered(SourceHalfMask, LowWord) ||
14175             isWordClobbered(SourceHalfMask, HighWord);
14176    };
14177
14178    if (IncomingInputs.empty())
14179      return;
14180
14181    if (ExistingInputs.empty()) {
14182      // Map any dwords with inputs from them into the right half.
14183      for (int Input : IncomingInputs) {
14184        // If the source half mask maps over the inputs, turn those into
14185        // swaps and use the swapped lane.
14186        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14187          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14188            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14189                Input - SourceOffset;
14190            // We have to swap the uses in our half mask in one sweep.
14191            for (int &M : HalfMask)
14192              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14193                M = Input;
14194              else if (M == Input)
14195                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14196          } else {
14197            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14198                       Input - SourceOffset &&
14199                   "Previous placement doesn't match!");
14200          }
14201          // Note that this correctly re-maps both when we do a swap and when
14202          // we observe the other side of the swap above. We rely on that to
14203          // avoid swapping the members of the input list directly.
14204          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14205        }
14206
14207        // Map the input's dword into the correct half.
14208        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14209          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14210        else
14211          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14212                     Input / 2 &&
14213                 "Previous placement doesn't match!");
14214      }
14215
14216      // And just directly shift any other-half mask elements to be same-half
14217      // as we will have mirrored the dword containing the element into the
14218      // same position within that half.
14219      for (int &M : HalfMask)
14220        if (M >= SourceOffset && M < SourceOffset + 4) {
14221          M = M - SourceOffset + DestOffset;
14222          assert(M >= 0 && "This should never wrap below zero!");
14223        }
14224      return;
14225    }
14226
14227    // Ensure we have the input in a viable dword of its current half. This
14228    // is particularly tricky because the original position may be clobbered
14229    // by inputs being moved and *staying* in that half.
14230    if (IncomingInputs.size() == 1) {
14231      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14232        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14233                         SourceOffset;
14234        SourceHalfMask[InputFixed - SourceOffset] =
14235            IncomingInputs[0] - SourceOffset;
14236        llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14237        IncomingInputs[0] = InputFixed;
14238      }
14239    } else if (IncomingInputs.size() == 2) {
14240      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14241          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14242        // We have two non-adjacent or clobbered inputs we need to extract from
14243        // the source half. To do this, we need to map them into some adjacent
14244        // dword slot in the source mask.
14245        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14246                              IncomingInputs[1] - SourceOffset};
14247
14248        // If there is a free slot in the source half mask adjacent to one of
14249        // the inputs, place the other input in it. We use (Index XOR 1) to
14250        // compute an adjacent index.
14251        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14252            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14253          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14254          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14255          InputsFixed[1] = InputsFixed[0] ^ 1;
14256        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14257                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14258          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14259          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14260          InputsFixed[0] = InputsFixed[1] ^ 1;
14261        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14262                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14263          // The two inputs are in the same DWord but it is clobbered and the
14264          // adjacent DWord isn't used at all. Move both inputs to the free
14265          // slot.
14266          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14267          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14268          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14269          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14270        } else {
14271          // The only way we hit this point is if there is no clobbering
14272          // (because there are no off-half inputs to this half) and there is no
14273          // free slot adjacent to one of the inputs. In this case, we have to
14274          // swap an input with a non-input.
14275          for (int i = 0; i < 4; ++i)
14276            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14277                   "We can't handle any clobbers here!");
14278          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14279                 "Cannot have adjacent inputs here!");
14280
14281          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14282          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14283
14284          // We also have to update the final source mask in this case because
14285          // it may need to undo the above swap.
14286          for (int &M : FinalSourceHalfMask)
14287            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14288              M = InputsFixed[1] + SourceOffset;
14289            else if (M == InputsFixed[1] + SourceOffset)
14290              M = (InputsFixed[0] ^ 1) + SourceOffset;
14291
14292          InputsFixed[1] = InputsFixed[0] ^ 1;
14293        }
14294
14295        // Point everything at the fixed inputs.
14296        for (int &M : HalfMask)
14297          if (M == IncomingInputs[0])
14298            M = InputsFixed[0] + SourceOffset;
14299          else if (M == IncomingInputs[1])
14300            M = InputsFixed[1] + SourceOffset;
14301
14302        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14303        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14304      }
14305    } else {
14306      llvm_unreachable("Unhandled input size!");
14307    }
14308
14309    // Now hoist the DWord down to the right half.
14310    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14311    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14312    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14313    for (int &M : HalfMask)
14314      for (int Input : IncomingInputs)
14315        if (M == Input)
14316          M = FreeDWord * 2 + Input % 2;
14317  };
14318  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14319                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
14320  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14321                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
14322
14323  // Now enact all the shuffles we've computed to move the inputs into their
14324  // target half.
14325  if (!isNoopShuffleMask(PSHUFLMask))
14326    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14327                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14328  if (!isNoopShuffleMask(PSHUFHMask))
14329    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14330                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14331  if (!isNoopShuffleMask(PSHUFDMask))
14332    V = DAG.getBitcast(
14333        VT,
14334        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14335                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14336
14337  // At this point, each half should contain all its inputs, and we can then
14338  // just shuffle them into their final position.
14339  assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14340         "Failed to lift all the high half inputs to the low mask!");
14341  assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14342         "Failed to lift all the low half inputs to the high mask!");
14343
14344  // Do a half shuffle for the low mask.
14345  if (!isNoopShuffleMask(LoMask))
14346    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14347                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14348
14349  // Do a half shuffle with the high mask after shifting its values down.
14350  for (int &M : HiMask)
14351    if (M >= 0)
14352      M -= 4;
14353  if (!isNoopShuffleMask(HiMask))
14354    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14355                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14356
14357  return V;
14358}
14359
14360/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14361/// blend if only one input is used.
///
/// Builds per-byte PSHUFB selector masks for \p V1 and \p V2 from the
/// element-granular \p Mask, shuffles each input that is actually referenced,
/// and ORs the two results together when both are needed. \p V1InUse and
/// \p V2InUse are set to tell the caller which inputs were consumed.
// NOTE(review): the declaration line and the first half of the lane-crossing
// assert (lines 14362/14365) appear to be missing from this extract — confirm
// against the upstream file.
14363    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14364    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14366         "Lane crossing shuffle masks not supported");
14367
14368  int NumBytes = VT.getSizeInBits() / 8;
14369  int Size = Mask.size();
  // Bytes per shuffle-mask element (e.g. 2 when Mask indexes i16 elements).
14370  int Scale = NumBytes / Size;
14371
14372  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14373  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14374  V1InUse = false;
14375  V2InUse = false;
14376
14377  for (int i = 0; i < NumBytes; ++i) {
14378    int M = Mask[i / Scale];
14379    if (M < 0)
14380      continue;
14381
    // A PSHUFB selector byte with its high bit set (0x80) zeroes that output
    // byte; each byte comes from exactly one input, so the other input's
    // selector is set to zero the lane.
14382    const int ZeroMask = 0x80;
14383    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14384    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14385    if (Zeroable[i / Scale])
14386      V1Idx = V2Idx = ZeroMask;
14387
14388    V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14389    V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14390    V1InUse |= (ZeroMask != V1Idx);
14391    V2InUse |= (ZeroMask != V2Idx);
14392  }
14393
14394  MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14395  if (V1InUse)
14396    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14397                     DAG.getBuildVector(ShufVT, DL, V1Mask));
14398  if (V2InUse)
14399    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14400                     DAG.getBuildVector(ShufVT, DL, V2Mask));
14401
14402  // If we need shuffled inputs from both, blend the two.
14403  SDValue V;
14404  if (V1InUse && V2InUse)
    // Unwanted lanes were zeroed by the PSHUFBs above, so a plain OR blends.
14405    V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14406  else
14407    V = V1InUse ? V1 : V2;
14408
14409  // Cast the result back to the correct type.
14410  return DAG.getBitcast(VT, V);
14411}
14412
14413/// Generic lowering of 8-lane i16 shuffles.
14414///
14415/// This handles both single-input shuffles and combined shuffle/blends with
14416/// two inputs. The single input shuffles are immediately delegated to
14417/// a dedicated lowering routine.
14418///
14419/// The blends are lowered in one of three fundamental ways. If there are few
14420/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14421/// of the input is significantly cheaper when lowered as an interleaving of
14422/// the two inputs, try to interleave them. Otherwise, blend the low and high
14423/// halves of the inputs separately (making them have relatively few inputs)
14424/// and then concatenate them.
// NOTE(review): the function signature line (original line 14425) was lost in
// this extraction. Per the 128-bit dispatch later in this file (case
// MVT::v8i16), this is lowerV8I16Shuffle — the generic lowering of 8-lane
// 16-bit integer shuffles. Verify against upstream LLVM before editing.
14426 const APInt &Zeroable, SDValue V1, SDValue V2,
14427 const X86Subtarget &Subtarget,
14428 SelectionDAG &DAG) {
14429 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14430 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14431 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14432
14433 // Whenever we can lower this as a zext, that instruction is strictly faster
14434 // than any alternative.
14435 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14436 Zeroable, Subtarget, DAG))
14437 return ZExt;
14438
14439 // Try to use lower using a truncation.
14440 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14441 Subtarget, DAG))
14442 return V;
14443
// Count mask elements that reference the second input (indices 8..15).
14444 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14445
// Single-input case: try cheap single-vector tricks before the general
// single-input v8i16 lowering.
14446 if (NumV2Inputs == 0) {
14447 // Try to use shift instructions.
14448 if (SDValue Shift =
14449 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14450 Subtarget, DAG, /*BitwiseOnly*/ false))
14451 return Shift;
14452
14453 // Check for being able to broadcast a single element.
14454 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14455 Mask, Subtarget, DAG))
14456 return Broadcast;
14457
14458 // Try to use bit rotation instructions.
14459 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14460 Subtarget, DAG))
14461 return Rotate;
14462
14463 // Use dedicated unpack instructions for masks that match their pattern.
14464 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14465 return V;
14466
14467 // Use dedicated pack instructions for masks that match their pattern.
14468 if (SDValue V =
14469 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14470 return V;
14471
14472 // Try to use byte rotation instructions.
14473 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14474 Subtarget, DAG))
14475 return Rotate;
14476
14477 // Make a copy of the mask so it can be modified.
14478 SmallVector<int, 8> MutableMask(Mask);
14479 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14480 Subtarget, DAG);
14481 }
14482
14483 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14484 "All single-input shuffles should be canonicalized to be V1-input "
14485 "shuffles.");
14486
14487 // Try to use shift instructions.
14488 if (SDValue Shift =
14489 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14490 DAG, /*BitwiseOnly*/ false))
14491 return Shift;
14492
14493 // See if we can use SSE4A Extraction / Insertion.
14494 if (Subtarget.hasSSE4A())
14495 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14496 Zeroable, DAG))
14497 return V;
14498
14499 // There are special ways we can lower some single-element blends.
14500 if (NumV2Inputs == 1)
// NOTE(review): the call line (original line 14501) is missing from this
// extraction; from the argument list below it is presumably
// `if (SDValue V = lowerShuffleAsElementInsertion(` — confirm upstream.
14502 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14503 return V;
14504
14505 // We have different paths for blend lowering, but they all must use the
14506 // *exact* same predicate.
14507 bool IsBlendSupported = Subtarget.hasSSE41();
14508 if (IsBlendSupported)
14509 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14510 Zeroable, Subtarget, DAG))
14511 return Blend;
14512
14513 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14514 Zeroable, Subtarget, DAG))
14515 return Masked;
14516
14517 // Use dedicated unpack instructions for masks that match their pattern.
14518 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14519 return V;
14520
14521 // Use dedicated pack instructions for masks that match their pattern.
14522 if (SDValue V =
14523 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14524 return V;
14525
14526 // Try to use lower using a truncation.
14527 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14528 Subtarget, DAG))
14529 return V;
14530
14531 // Try to use byte rotation instructions.
14532 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14533 Subtarget, DAG))
14534 return Rotate;
14535
14536 if (SDValue BitBlend =
14537 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14538 return BitBlend;
14539
14540 // Try to use byte shift instructions to mask.
14541 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14542 Zeroable, Subtarget, DAG))
14543 return V;
14544
14545 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
14546 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14547 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14548 !Subtarget.hasVLX()) {
14549 // Check if this is part of a 256-bit vector truncation.
14550 unsigned PackOpc = 0;
14551 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
// NOTE(review): the remainder of this condition (original lines 14552-14553)
// is missing from this extraction — presumably it checks both inputs for a
// shared 256-bit source (the "256-bit vector truncation" the comment above
// refers to). Confirm upstream before editing.
14554 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14555 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14556 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14557 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14558 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14559 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14560 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14561 PackOpc = X86ISD::PACKUS;
14562 } else if (Subtarget.hasSSE41()) {
// Zero the upper 16 bits of each dword so PACKUSDW's unsigned saturation
// passes the kept words through unchanged.
14563 SmallVector<SDValue, 4> DWordClearOps(4,
14564 DAG.getConstant(0, DL, MVT::i32));
14565 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14566 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14567 SDValue DWordClearMask =
14568 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14569 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14570 DWordClearMask);
14571 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14572 DWordClearMask);
14573 PackOpc = X86ISD::PACKUS;
14574 } else if (!Subtarget.hasSSSE3()) {
// Pre-SSE4.1 there is no PACKUSDW, so sign-extend each kept word across its
// dword (shl 16 then sra 16) and use PACKSS, whose signed saturation then
// preserves the values.
14575 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14576 V1 = DAG.getBitcast(MVT::v4i32, V1);
14577 V2 = DAG.getBitcast(MVT::v4i32, V2);
14578 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14579 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14580 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14581 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14582 PackOpc = X86ISD::PACKSS;
14583 }
14584 if (PackOpc) {
14585 // Now pack things back together.
14586 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14587 if (NumEvenDrops == 2) {
14588 Result = DAG.getBitcast(MVT::v4i32, Result);
14589 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14590 }
14591 return Result;
14592 }
14593 }
14594
14595 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14596 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14597 if (NumOddDrops == 1) {
14598 bool HasSSE41 = Subtarget.hasSSE41();
14599 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14600 DAG.getBitcast(MVT::v4i32, V1),
14601 DAG.getTargetConstant(16, DL, MVT::i8));
14602 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14603 DAG.getBitcast(MVT::v4i32, V2),
14604 DAG.getTargetConstant(16, DL, MVT::i8));
14605 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14606 MVT::v8i16, V1, V2);
14607 }
14608
14609 // Try to lower by permuting the inputs into an unpack instruction.
14610 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14611 Mask, Subtarget, DAG))
14612 return Unpack;
14613
14614 // If we can't directly blend but can use PSHUFB, that will be better as it
14615 // can both shuffle and set up the inefficient blend.
14616 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14617 bool V1InUse, V2InUse;
14618 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14619 Zeroable, DAG, V1InUse, V2InUse);
14620 }
14621
14622 // We can always bit-blend if we have to so the fallback strategy is to
14623 // decompose into single-input permutes and blends/unpacks.
14624 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14625 Zeroable, Subtarget, DAG);
14626}
14627
14628/// Lower 8-lane 16-bit floating point shuffles.
// NOTE(review): the function signature line (original line 14629) was lost in
// this extraction. Per the 128-bit dispatch later in this file (case
// MVT::v8f16), this is lowerV8F16Shuffle. Verify against upstream LLVM.
14630 const APInt &Zeroable, SDValue V1, SDValue V2,
14631 const X86Subtarget &Subtarget,
14632 SelectionDAG &DAG) {
14633 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14634 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14635 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Count mask elements that reference the second input (indices 8..15).
14636 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14637
14638 if (Subtarget.hasFP16()) {
14639 if (NumV2Elements == 0) {
14640 // Check for being able to broadcast a single element.
14641 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14642 Mask, Subtarget, DAG))
14643 return Broadcast;
14644 }
14645 if (NumV2Elements == 1 && Mask[0] >= 8)
// NOTE(review): the call line (original line 14646) is missing from this
// extraction; from the argument list below it is presumably
// `if (SDValue V = lowerShuffleAsElementInsertion(` — confirm upstream.
14647 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14648 return V;
14649 }
14650
// Fallback: bitcast to v8i16 and reuse the integer shuffle lowering, then
// bitcast the result back to v8f16.
14651 V1 = DAG.getBitcast(MVT::v8i16, V1);
14652 V2 = DAG.getBitcast(MVT::v8i16, V2);
14653 return DAG.getBitcast(MVT::v8f16,
14654 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14655}
14656
14657// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14658// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14659// the active subvector is extracted.
// NOTE(review): the function signature line (original line 14660) was lost in
// this extraction. Per the call site in the v16i8 lowering below
// (`return lowerShuffleWithPERMV(...)`), this is lowerShuffleWithPERMV.
14661 ArrayRef<int> OriginalMask, SDValue V1,
14662 SDValue V2, const X86Subtarget &Subtarget,
14663 SelectionDAG &DAG) {
14664 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14665 SmallVector<int, 32> Mask(OriginalMask);
14666 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14667 !isShuffleFoldableLoad(V2)) {
// NOTE(review): one line (original line 14668) is missing from this
// extraction — presumably the mask-commute call that accompanies the operand
// swap below (e.g. ShuffleVectorSDNode::commuteMask(Mask)). Confirm upstream.
14669 std::swap(V1, V2);
14670 }
14671
14672 MVT MaskVT = VT.changeTypeToInteger();
14673 SDValue MaskNode;
14674 MVT ShuffleVT = VT;
// Non-VLX, sub-512-bit: pad both inputs to 512 bits and shuffle at 512-bit
// width, then extract the original-width subvector at the end.
14675 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14676 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14677 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14678 ShuffleVT = V1.getSimpleValueType();
14679
14680 // Adjust mask to correct indices for the second input.
14681 int NumElts = VT.getVectorNumElements();
14682 unsigned Scale = 512 / VT.getSizeInBits();
14683 SmallVector<int, 32> AdjustedMask(Mask);
14684 for (int &M : AdjustedMask)
14685 if (NumElts <= M)
14686 M += (Scale - 1) * NumElts;
14687 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14688 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14689 } else {
14690 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14691 }
14692
// Unary shuffles use VPERMV; binary shuffles use VPERMV3.
14693 SDValue Result;
14694 if (V2.isUndef())
14695 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14696 else
14697 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14698
14699 if (VT != ShuffleVT)
14700 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14701
14702 return Result;
14703}
14704
14705/// Generic lowering of v16i8 shuffles.
14706///
14707/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14708/// detect any complexity reducing interleaving. If that doesn't help, it uses
14709/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14710/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14711/// back together.
// NOTE(review): the function signature line (original line 14712) was lost in
// this extraction. Per the 128-bit dispatch later in this file (case
// MVT::v16i8), this is lowerV16I8Shuffle. Verify against upstream LLVM.
14713 const APInt &Zeroable, SDValue V1, SDValue V2,
14714 const X86Subtarget &Subtarget,
14715 SelectionDAG &DAG) {
14716 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14717 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14718 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14719
14720 // Try to use shift instructions.
14721 if (SDValue Shift =
14722 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14723 DAG, /*BitwiseOnly*/ false))
14724 return Shift;
14725
14726 // Try to use byte rotation instructions.
14727 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14728 Subtarget, DAG))
14729 return Rotate;
14730
14731 // Use dedicated pack instructions for masks that match their pattern.
14732 if (SDValue V =
14733 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14734 return V;
14735
14736 // Try to use a zext lowering.
14737 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14738 Zeroable, Subtarget, DAG))
14739 return ZExt;
14740
14741 // Try to use lower using a truncation.
14742 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14743 Subtarget, DAG))
14744 return V;
14745
14746 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14747 Subtarget, DAG))
14748 return V;
14749
14750 // See if we can use SSE4A Extraction / Insertion.
14751 if (Subtarget.hasSSE4A())
14752 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14753 Zeroable, DAG))
14754 return V;
14755
// Count mask elements that reference the second input (indices 16..31).
14756 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14757
14758 // For single-input shuffles, there are some nicer lowering tricks we can use.
14759 if (NumV2Elements == 0) {
14760 // Check for being able to broadcast a single element.
14761 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14762 Mask, Subtarget, DAG))
14763 return Broadcast;
14764
14765 // Try to use bit rotation instructions.
14766 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14767 Subtarget, DAG))
14768 return Rotate;
14769
14770 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14771 return V;
14772
14773 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14774 // Notably, this handles splat and partial-splat shuffles more efficiently.
14775 // However, it only makes sense if the pre-duplication shuffle simplifies
14776 // things significantly. Currently, this means we need to be able to
14777 // express the pre-duplication shuffle as an i16 shuffle.
14778 //
14779 // FIXME: We should check for other patterns which can be widened into an
14780 // i16 shuffle as well.
// Widening is only viable when every used byte pair selects one source byte
// (each even/odd pair is either undef or duplicated).
14781 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14782 for (int i = 0; i < 16; i += 2)
14783 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14784 return false;
14785
14786 return true;
14787 };
14788 auto tryToWidenViaDuplication = [&]() -> SDValue {
14789 if (!canWidenViaDuplication(Mask))
14790 return SDValue();
// Collect the distinct low-half (0..7) and high-half (8..15) inputs.
14791 SmallVector<int, 4> LoInputs;
14792 copy_if(Mask, std::back_inserter(LoInputs),
14793 [](int M) { return M >= 0 && M < 8; });
14794 array_pod_sort(LoInputs.begin(), LoInputs.end());
14795 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14796 SmallVector<int, 4> HiInputs;
14797 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14798 array_pod_sort(HiInputs.begin(), HiInputs.end());
14799 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14800
// Target whichever half already holds the majority of the inputs and move
// the rest into it with an i16 pre-shuffle.
14801 bool TargetLo = LoInputs.size() >= HiInputs.size();
14802 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14803 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14804
14805 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
// NOTE(review): one line (original line 14806) is missing from this
// extraction — presumably the declaration of `LaneMap` (a byte-index map,
// written below and read when building PostDupI16Shuffle). Confirm upstream.
14807 for (int I : InPlaceInputs) {
14808 PreDupI16Shuffle[I/2] = I/2;
14809 LaneMap[I] = I;
14810 }
14811 int j = TargetLo ? 0 : 4, je = j + 4;
14812 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14813 // Check if j is already a shuffle of this input. This happens when
14814 // there are two adjacent bytes after we move the low one.
14815 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14816 // If we haven't yet mapped the input, search for a slot into which
14817 // we can map it.
14818 while (j < je && PreDupI16Shuffle[j] >= 0)
14819 ++j;
14820
14821 if (j == je)
14822 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14823 return SDValue();
14824
14825 // Map this input with the i16 shuffle.
14826 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14827 }
14828
14829 // Update the lane map based on the mapping we ended up with.
14830 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14831 }
14832 V1 = DAG.getBitcast(
14833 MVT::v16i8,
14834 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14835 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14836
14837 // Unpack the bytes to form the i16s that will be shuffled into place.
14838 bool EvenInUse = false, OddInUse = false;
14839 for (int i = 0; i < 16; i += 2) {
14840 EvenInUse |= (Mask[i + 0] >= 0);
14841 OddInUse |= (Mask[i + 1] >= 0);
14842 if (EvenInUse && OddInUse)
14843 break;
14844 }
14845 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14846 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14847 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14848
14849 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14850 for (int i = 0; i < 16; ++i)
14851 if (Mask[i] >= 0) {
14852 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14853 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14854 if (PostDupI16Shuffle[i / 2] < 0)
14855 PostDupI16Shuffle[i / 2] = MappedMask;
14856 else
14857 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14858 "Conflicting entries in the original shuffle!");
14859 }
14860 return DAG.getBitcast(
14861 MVT::v16i8,
14862 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14863 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14864 };
14865 if (SDValue V = tryToWidenViaDuplication())
14866 return V;
14867 }
14868
14869 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14870 Zeroable, Subtarget, DAG))
14871 return Masked;
14872
14873 // Use dedicated unpack instructions for masks that match their pattern.
14874 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14875 return V;
14876
14877 // Try to use byte shift instructions to mask.
14878 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14879 Zeroable, Subtarget, DAG))
14880 return V;
14881
14882 // Check for compaction patterns.
14883 bool IsSingleInput = V2.isUndef();
14884 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14885
14886 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14887 // with PSHUFB. It is important to do this before we attempt to generate any
14888 // blends but after all of the single-input lowerings. If the single input
14889 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14890 // want to preserve that and we can DAG combine any longer sequences into
14891 // a PSHUFB in the end. But once we start blending from multiple inputs,
14892 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14893 // and there are *very* few patterns that would actually be faster than the
14894 // PSHUFB approach because of its ability to zero lanes.
14895 //
14896 // If the mask is a binary compaction, we can more efficiently perform this
14897 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14898 //
14899 // FIXME: The only exceptions to the above are blends which are exact
14900 // interleavings with direct instructions supporting them. We currently don't
14901 // handle those well here.
14902 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14903 bool V1InUse = false;
14904 bool V2InUse = false;
14905
// NOTE(review): one line (original line 14906) is missing from this
// extraction — given the `return PSHUFB;` below, it is presumably
// `SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(`. Confirm upstream.
14907 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14908
14909 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14910 // do so. This avoids using them to handle blends-with-zero which is
14911 // important as a single pshufb is significantly faster for that.
14912 if (V1InUse && V2InUse) {
14913 if (Subtarget.hasSSE41())
14914 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14915 Zeroable, Subtarget, DAG))
14916 return Blend;
14917
14918 // We can use an unpack to do the blending rather than an or in some
14919 // cases. Even though the or may be (very minorly) more efficient, we
14920 // preference this lowering because there are common cases where part of
14921 // the complexity of the shuffles goes away when we do the final blend as
14922 // an unpack.
14923 // FIXME: It might be worth trying to detect if the unpack-feeding
14924 // shuffles will both be pshufb, in which case we shouldn't bother with
14925 // this.
// NOTE(review): one line (original line 14926) is missing from this
// extraction — given the `return Unpack;` below, it is presumably
// `if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(`. Confirm upstream.
14927 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14928 return Unpack;
14929
14930 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14931 if (Subtarget.hasVBMI())
14932 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14933 DAG);
14934
14935 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14936 if (Subtarget.hasXOP()) {
14937 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14938 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14939 }
14940
14941 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14942 // PALIGNR will be cheaper than the second PSHUFB+OR.
// NOTE(review): one line (original line 14943) is missing from this
// extraction — presumably the PALIGNR+permute helper call suggested by the
// comment above (e.g. lowerShuffleAsByteRotateAndPermute). Confirm upstream.
14944 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14945 return V;
14946 }
14947
14948 return PSHUFB;
14949 }
14950
14951 // There are special ways we can lower some single-element blends.
14952 if (NumV2Elements == 1)
// NOTE(review): the call line (original line 14953) is missing from this
// extraction; presumably `if (SDValue V = lowerShuffleAsElementInsertion(`,
// matching the equivalent guard in the v8i16 lowering. Confirm upstream.
14954 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14955 return V;
14956
14957 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14958 return Blend;
14959
14960 // Check whether a compaction lowering can be done. This handles shuffles
14961 // which take every Nth element for some even N. See the helper function for
14962 // details.
14963 //
14964 // We special case these as they can be particularly efficiently handled with
14965 // the PACKUSB instruction on x86 and they show up in common patterns of
14966 // rearranging bytes to truncate wide elements.
14967 if (NumEvenDrops) {
14968 // NumEvenDrops is the power of two stride of the elements. Another way of
14969 // thinking about it is that we need to drop the even elements this many
14970 // times to get the original input.
14971
14972 // First we need to zero all the dropped bytes.
14973 assert(NumEvenDrops <= 3 &&
14974 "No support for dropping even elements more than 3 times.");
// Clear the high byte of each kept word so PACKUS's unsigned saturation
// passes the kept bytes through unchanged.
14975 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14976 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14977 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14978 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14979 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14980 WordClearMask);
14981 if (!IsSingleInput)
14982 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14983 WordClearMask);
14984
14985 // Now pack things back together.
14986 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14987 IsSingleInput ? V1 : V2);
14988 for (int i = 1; i < NumEvenDrops; ++i) {
14989 Result = DAG.getBitcast(MVT::v8i16, Result);
14990 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14991 }
14992 return Result;
14993 }
14994
14995 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14996 if (NumOddDrops == 1) {
14997 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14998 DAG.getBitcast(MVT::v8i16, V1),
14999 DAG.getTargetConstant(8, DL, MVT::i8));
15000 if (!IsSingleInput)
15001 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15002 DAG.getBitcast(MVT::v8i16, V2),
15003 DAG.getTargetConstant(8, DL, MVT::i8));
15004 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15005 IsSingleInput ? V1 : V2);
15006 }
15007
15008 // Handle multi-input cases by blending/unpacking single-input shuffles.
15009 if (NumV2Elements > 0)
15010 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15011 Zeroable, Subtarget, DAG);
15012
15013 // The fallback path for single-input shuffles widens this into two v8i16
15014 // vectors with unpacks, shuffles those, and then pulls them back together
15015 // with a pack.
15016 SDValue V = V1;
15017
// Split the 16-byte mask into the masks for the low and high 8-byte halves.
15018 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15019 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15020 for (int i = 0; i < 16; ++i)
15021 if (Mask[i] >= 0)
15022 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15023
15024 SDValue VLoHalf, VHiHalf;
15025 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15026 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15027 // i16s.
15028 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15029 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15030 // Use a mask to drop the high bytes.
15031 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15032 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15033 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15034
15035 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15036 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15037
15038 // Squash the masks to point directly into VLoHalf.
15039 for (int &M : LoBlendMask)
15040 if (M >= 0)
15041 M /= 2;
15042 for (int &M : HiBlendMask)
15043 if (M >= 0)
15044 M /= 2;
15045 } else {
15046 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15047 // VHiHalf so that we can blend them as i16s.
15048 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15049
15050 VLoHalf = DAG.getBitcast(
15051 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15052 VHiHalf = DAG.getBitcast(
15053 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15054 }
15055
15056 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15057 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15058
15059 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15060}
15061
15062/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15063///
15064/// This routine breaks down the specific type of 128-bit shuffle and
15065/// dispatches to the lowering routines accordingly.
// NOTE(review): the function signature line (original line 15066) was lost in
// this extraction; per the doc comment above this is the 128-bit shuffle
// dispatch routine (presumably lower128BitShuffle). Verify against upstream.
15067 MVT VT, SDValue V1, SDValue V2,
15068 const APInt &Zeroable,
15069 const X86Subtarget &Subtarget,
15070 SelectionDAG &DAG) {
// v8bf16 has no dedicated lowering: bitcast to v8i16, shuffle there, and
// bitcast the result back.
15071 if (VT == MVT::v8bf16) {
15072 V1 = DAG.getBitcast(MVT::v8i16, V1);
15073 V2 = DAG.getBitcast(MVT::v8i16, V2);
15074 return DAG.getBitcast(VT,
15075 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15076 }
15077
// Dispatch on element type/count to the dedicated per-type lowering routine.
15078 switch (VT.SimpleTy) {
15079 case MVT::v2i64:
15080 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15081 case MVT::v2f64:
15082 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15083 case MVT::v4i32:
15084 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15085 case MVT::v4f32:
15086 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15087 case MVT::v8i16:
15088 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15089 case MVT::v8f16:
15090 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15091 case MVT::v16i8:
15092 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15093
15094 default:
15095 llvm_unreachable("Unimplemented!");
15096 }
15097}
15098
15099/// Generic routine to split vector shuffle into half-sized shuffles.
15100///
15101/// This routine just extracts two subvectors, shuffles them independently, and
15102/// then concatenates them back together. This should work effectively with all
15103/// AVX vector shuffle types.
// NOTE(review): the function signature line (original line 15104) was lost in
// this extraction. The recursive call below names it: splitAndLowerShuffle.
15105 SDValue V2, ArrayRef<int> Mask,
15106 SelectionDAG &DAG, bool SimpleOnly) {
15107 assert(VT.getSizeInBits() >= 256 &&
15108 "Only for 256-bit or wider vector shuffles!");
15109 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15110 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15111
15112 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15113 if (VT == MVT::v8f32) {
15114 SDValue BC1 = peekThroughBitcasts(V1);
15115 SDValue BC2 = peekThroughBitcasts(V2);
15116 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15117 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15118 DAG, SimpleOnly))
15119 return DAG.getBitcast(VT, Split);
15120 }
15121 }
15122
// Split the mask into the halves applying to the low and high result halves.
15123 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15124 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15125
15126 int NumElements = VT.getVectorNumElements();
15127 int SplitNumElements = NumElements / 2;
15128 MVT ScalarVT = VT.getVectorElementType();
15129 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15130
15131 // Use splitVector/extractSubVector so that split build-vectors just build two
15132 // narrower build vectors. This helps shuffling with splats and zeros.
15133 auto SplitVector = [&](SDValue V) {
15134 SDValue LoV, HiV;
15135 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15136 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15137 DAG.getBitcast(SplitVT, HiV));
15138 };
15139
15140 SDValue LoV1, HiV1, LoV2, HiV2;
15141 std::tie(LoV1, HiV1) = SplitVector(V1);
15142 std::tie(LoV2, HiV2) = SplitVector(V2);
15143
15144 // Now create two 4-way blends of these half-width vectors.
// Record which of the four half-width pieces a half-mask actually reads.
15145 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15146 bool &UseHiV1, bool &UseLoV2,
15147 bool &UseHiV2) {
15148 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15149 for (int i = 0; i < SplitNumElements; ++i) {
15150 int M = HalfMask[i];
15151 if (M >= NumElements) {
15152 if (M >= NumElements + SplitNumElements)
15153 UseHiV2 = true;
15154 else
15155 UseLoV2 = true;
15156 } else if (M >= 0) {
15157 if (M >= SplitNumElements)
15158 UseHiV1 = true;
15159 else
15160 UseLoV1 = true;
15161 }
15162 }
15163 };
15164
// In SimpleOnly mode a half-blend is only acceptable when it reads no
// high-half pieces of either input.
15165 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15166 if (!SimpleOnly)
15167 return true;
15168
15169 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15170 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15171
15172 return !(UseHiV1 || UseHiV2);
15173 };
15174
15175 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15176 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15177 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15178 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15179 for (int i = 0; i < SplitNumElements; ++i) {
15180 int M = HalfMask[i];
15181 if (M >= NumElements) {
15182 V2BlendMask[i] = M - NumElements;
15183 BlendMask[i] = SplitNumElements + i;
15184 } else if (M >= 0) {
15185 V1BlendMask[i] = M;
15186 BlendMask[i] = i;
15187 }
15188 }
15189
15190 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15191 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15192
15193 // Because the lowering happens after all combining takes place, we need to
15194 // manually combine these blend masks as much as possible so that we create
15195 // a minimal number of high-level vector shuffle nodes.
15196 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15197
15198 // First try just blending the halves of V1 or V2.
15199 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15200 return DAG.getUNDEF(SplitVT);
15201 if (!UseLoV2 && !UseHiV2)
15202 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15203 if (!UseLoV1 && !UseHiV1)
15204 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15205
15206 SDValue V1Blend, V2Blend;
15207 if (UseLoV1 && UseHiV1) {
15208 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15209 } else {
15210 // We only use half of V1 so map the usage down into the final blend mask.
15211 V1Blend = UseLoV1 ? LoV1 : HiV1;
15212 for (int i = 0; i < SplitNumElements; ++i)
15213 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15214 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15215 }
15216 if (UseLoV2 && UseHiV2) {
15217 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15218 } else {
15219 // We only use half of V2 so map the usage down into the final blend mask.
15220 V2Blend = UseLoV2 ? LoV2 : HiV2;
15221 for (int i = 0; i < SplitNumElements; ++i)
15222 if (BlendMask[i] >= SplitNumElements)
15223 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15224 }
15225 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15226 };
15227
15228 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15229 return SDValue();
15230
15231 SDValue Lo = HalfBlend(LoMask);
15232 SDValue Hi = HalfBlend(HiMask);
15233 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15234}
15235
15236/// Either split a vector in halves or decompose the shuffles and the
15237/// blend/unpack.
15238///
15239/// This is provided as a good fallback for many lowerings of non-single-input
15240/// shuffles with more than one 128-bit lane. In those cases, we want to select
15241/// between splitting the shuffle into 128-bit components and stitching those
15242/// back together vs. extracting the single-input shuffles and blending those
15243/// results.
15245 SDValue V2, ArrayRef<int> Mask,
15246 const APInt &Zeroable,
15247 const X86Subtarget &Subtarget,
15248 SelectionDAG &DAG) {
15249 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15250 "shuffles as it could then recurse on itself.");
15251 int Size = Mask.size();
15252
15253 // If this can be modeled as a broadcast of two elements followed by a blend,
15254 // prefer that lowering. This is especially important because broadcasts can
15255 // often fold with memory operands.
  // Returns true if every non-undef mask element referencing V1 is (or is
  // equivalent to) a single V1 element, and likewise for V2 — i.e. the whole
  // shuffle is two broadcasts plus a blend.
15256 auto DoBothBroadcast = [&] {
15257 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15258 for (int M : Mask)
15259 if (M >= Size) {
15260 if (V2BroadcastIdx < 0)
15261 V2BroadcastIdx = M - Size;
15262 else if ((M - Size) != V2BroadcastIdx &&
15263 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15264 return false;
15265 } else if (M >= 0) {
15266 if (V1BroadcastIdx < 0)
15267 V1BroadcastIdx = M;
15268 else if (M != V1BroadcastIdx &&
15269 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15270 return false;
15271 }
15272 return true;
15273 };
15274 if (DoBothBroadcast())
15275 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15276 Subtarget, DAG);
15277
15278 // If the inputs all stem from a single 128-bit lane of each input, then we
15279 // split them rather than blending because the split will decompose to
15280 // unusually few instructions.
15281 int LaneCount = VT.getSizeInBits() / 128;
15282 int LaneSize = Size / LaneCount;
15283 SmallBitVector LaneInputs[2];
15284 LaneInputs[0].resize(LaneCount, false);
15285 LaneInputs[1].resize(LaneCount, false);
  // Record, per source operand, which 128-bit lanes any mask element reads.
15286 for (int i = 0; i < Size; ++i)
15287 if (Mask[i] >= 0)
15288 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15289 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15290 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15291 /*SimpleOnly*/ false);
15292
15293 // Without AVX2, if we can freely split the subvectors then we're better off
15294 // performing half width shuffles.
15295 if (!Subtarget.hasAVX2()) {
15296 SDValue BC1 = peekThroughBitcasts(V1);
15297 SDValue BC2 = peekThroughBitcasts(V2);
15298 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15299 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15300 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15301 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15302 if (SplatOrSplitV1 && SplatOrSplitV2)
15303 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15304 /*SimpleOnly*/ false);
15305 }
15306
15307 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15308 // requires that the decomposed single-input shuffles don't end up here.
15309 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15310 Subtarget, DAG);
15311}
15312
15313// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15314// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15316 SDValue V1, SDValue V2,
15317 ArrayRef<int> Mask,
15318 SelectionDAG &DAG) {
15319 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15320
  // Masks for the two lane-shuffled operands and the final SHUFPD selector.
15321 int LHSMask[4] = {-1, -1, -1, -1};
15322 int RHSMask[4] = {-1, -1, -1, -1};
15323 int SHUFPDMask[4] = {-1, -1, -1, -1};
15324
15325 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15326 // perform the shuffle once the lanes have been shuffled in place.
15327 for (int i = 0; i != 4; ++i) {
15328 int M = Mask[i];
15329 if (M < 0)
15330 continue;
    // Even result elements come from LHS, odd from RHS; place the source
    // element in the low/high slot of the destination's 128-bit lane.
15331 int LaneBase = i & ~1;
15332 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15333 LaneMask[LaneBase + (M & 1)] = M;
15334 SHUFPDMask[i] = M & 1;
15335 }
15336
15337 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15338 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15339 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15340 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15341}
15342
15343/// Lower a vector shuffle crossing multiple 128-bit lanes as
15344/// a lane permutation followed by a per-lane permutation.
15345///
15346/// This is mainly for cases where we can have non-repeating permutes
15347/// in each lane.
15348///
15349/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15350/// we should investigate merging them.
15352 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15353 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15354 int NumElts = VT.getVectorNumElements();
15355 int NumLanes = VT.getSizeInBits() / 128;
15356 int NumEltsPerLane = NumElts / NumLanes;
  // Sub-128-bit sublane granularity needs AVX2 cross-lane permutes and only
  // handles the unary case here.
15357 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15358
15359 /// Attempts to find a sublane permute with the given size
15360 /// that gets all elements into their target lanes.
15361 ///
15362 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
15363 /// If unsuccessful, returns false and may overwrite InLaneMask.
15364 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15365 int NumSublanesPerLane = NumSublanes / NumLanes;
15366 int NumEltsPerSublane = NumElts / NumSublanes;
15367
15368 SmallVector<int, 16> CrossLaneMask;
15369 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15370 // CrossLaneMask but one entry == one sublane.
15371 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
    // Tracks which elements of the cross-lane shuffle result are actually read
    // by the in-lane shuffle, so unused entries can be undef'd below.
15372 APInt DemandedCrossLane = APInt::getZero(NumElts);
15373
15374 for (int i = 0; i != NumElts; ++i) {
15375 int M = Mask[i];
15376 if (M < 0)
15377 continue;
15378
15379 int SrcSublane = M / NumEltsPerSublane;
15380 int DstLane = i / NumEltsPerLane;
15381
15382 // We only need to get the elements into the right lane, not sublane.
15383 // So search all sublanes that make up the destination lane.
15384 bool Found = false;
15385 int DstSubStart = DstLane * NumSublanesPerLane;
15386 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15387 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15388 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15389 continue;
15390
15391 Found = true;
15392 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15393 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15394 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15395 DemandedCrossLane.setBit(InLaneMask[i]);
15396 break;
15397 }
15398 if (!Found)
15399 return SDValue();
15400 }
15401
15402 // Fill CrossLaneMask using CrossLaneMaskLarge.
15403 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15404
15405 if (!CanUseSublanes) {
15406 // If we're only shuffling a single lowest lane and the rest are identity
15407 // then don't bother.
15408 // TODO - isShuffleMaskInputInPlace could be extended to something like
15409 // this.
15410 int NumIdentityLanes = 0;
15411 bool OnlyShuffleLowestLane = true;
15412 for (int i = 0; i != NumLanes; ++i) {
15413 int LaneOffset = i * NumEltsPerLane;
15414 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15415 i * NumEltsPerLane))
15416 NumIdentityLanes++;
15417 else if (CrossLaneMask[LaneOffset] != 0)
15418 OnlyShuffleLowestLane = false;
15419 }
15420 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15421 return SDValue();
15422 }
15423
15424 // Simplify CrossLaneMask based on the actual demanded elements.
15425 if (V1.hasOneUse())
15426 for (int i = 0; i != NumElts; ++i)
15427 if (!DemandedCrossLane[i])
15428 CrossLaneMask[i] = SM_SentinelUndef;
15429
15430 // Avoid returning the same shuffle operation. For example,
15431 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15432 // undef:v16i16
15433 if (CrossLaneMask == Mask || InLaneMask == Mask)
15434 return SDValue();
15435
15436 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15437 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15438 InLaneMask);
15439 };
15440
15441 // First attempt a solution with full lanes.
15442 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15443 return V;
15444
15445 // The rest of the solutions use sublanes.
15446 if (!CanUseSublanes)
15447 return SDValue();
15448
15449 // Then attempt a solution with 64-bit sublanes (vpermq).
15450 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15451 return V;
15452
15453 // If that doesn't work and we have fast variable cross-lane shuffle,
15454 // attempt 32-bit sublanes (vpermd).
15455 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15456 return SDValue();
15457
15458 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15459}
15460
15461/// Helper to get compute inlane shuffle mask for a complete shuffle mask.
15462static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15463 SmallVector<int> &InLaneMask) {
15464 int Size = Mask.size();
15465 InLaneMask.assign(Mask.begin(), Mask.end());
15466 for (int i = 0; i < Size; ++i) {
15467 int &M = InLaneMask[i];
15468 if (M < 0)
15469 continue;
15470 if (((M % Size) / LaneSize) != (i / LaneSize))
15471 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15472 }
15473}
15474
15475/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15476/// source with a lane permutation.
15477///
15478/// This lowering strategy results in four instructions in the worst case for a
15479/// single-input cross lane shuffle which is lower than any other fully general
15480/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15481/// shuffle pattern should be handled prior to trying this lowering.
15483 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15484 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15485 // FIXME: This should probably be generalized for 512-bit vectors as well.
15486 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15487 int Size = Mask.size();
15488 int LaneSize = Size / 2;
15489
15490 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15491 // Only do this if the elements aren't all from the lower lane,
15492 // otherwise we're (probably) better off doing a split.
15493 if (VT == MVT::v4f64 &&
15494 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15495 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15496
15497 // If there are only inputs from one 128-bit lane, splitting will in fact be
15498 // less expensive. The flags track whether the given lane contains an element
15499 // that crosses to another lane.
15500 bool AllLanes;
15501 if (!Subtarget.hasAVX2()) {
    // Pre-AVX2: only cross-lane *crossings* matter, since in-lane shuffles of
    // each half are cheap after a split.
15502 bool LaneCrossing[2] = {false, false};
15503 for (int i = 0; i < Size; ++i)
15504 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15505 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15506 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15507 } else {
    // AVX2: any use of both source lanes is enough to justify this lowering.
15508 bool LaneUsed[2] = {false, false};
15509 for (int i = 0; i < Size; ++i)
15510 if (Mask[i] >= 0)
15511 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15512 AllLanes = LaneUsed[0] && LaneUsed[1];
15513 }
15514
15515 // TODO - we could support shuffling V2 in the Flipped input.
15516 assert(V2.isUndef() &&
15517 "This last part of this routine only works on single input shuffles");
15518
15519 SmallVector<int> InLaneMask;
15520 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15521
15522 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15523 "In-lane shuffle mask expected");
15524
15525 // If we're not using both lanes in each lane and the inlane mask is not
15526 // repeating, then we're better off splitting.
15527 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15528 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15529 /*SimpleOnly*/ false);
15530
15531 // Flip the lanes, and shuffle the results which should now be in-lane.
15532 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15533 SDValue Flipped = DAG.getBitcast(PVT, V1);
15534 Flipped =
15535 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15536 Flipped = DAG.getBitcast(VT, Flipped);
15537 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15538}
15539
15540/// Handle lowering 2-lane 128-bit shuffles.
15542 SDValue V2, ArrayRef<int> Mask,
15543 const APInt &Zeroable,
15544 const X86Subtarget &Subtarget,
15545 SelectionDAG &DAG) {
15546 if (V2.isUndef()) {
15547 // Attempt to match VBROADCAST*128 subvector broadcast load.
15548 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15549 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15550 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15552 MVT MemVT = VT.getHalfNumVectorElementsVT();
      // Splatting the upper half broadcasts from the load's second 128 bits.
15553 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15554 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15556 VT, MemVT, Ld, Ofs, DAG))
15557 return BcstLd;
15558 }
15559
15560 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15561 if (Subtarget.hasAVX2())
15562 return SDValue();
15563 }
15564
15565 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15566
15567 SmallVector<int, 4> WidenedMask;
15568 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15569 return SDValue();
15570
  // Zeroable is a 4-bit mask here; 0x3/0xc cover the low/high 128-bit halves.
15571 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15572 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15573
15574 // Try to use an insert into a zero vector.
15575 if (WidenedMask[0] == 0 && IsHighZero) {
15576 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15577 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15578 DAG.getVectorIdxConstant(0, DL));
15579 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15580 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15581 DAG.getVectorIdxConstant(0, DL));
15582 }
15583
15584 // TODO: If minimizing size and one of the inputs is a zero vector and the
15585 // the zero vector has only one use, we could use a VPERM2X128 to save the
15586 // instruction bytes needed to explicitly generate the zero vector.
15587
15588 // Blends are faster and handle all the non-lane-crossing cases.
15589 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15590 Subtarget, DAG))
15591 return Blend;
15592
15593 // If either input operand is a zero vector, use VPERM2X128 because its mask
15594 // allows us to replace the zero input with an implicit zero.
15595 if (!IsLowZero && !IsHighZero) {
15596 // Check for patterns which can be matched with a single insert of a 128-bit
15597 // subvector.
15598 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15599 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15600
15601 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15602 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15603 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15604 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15605 SDValue SubVec =
15606 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15607 DAG.getVectorIdxConstant(0, DL));
15608 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15609 DAG.getVectorIdxConstant(2, DL));
15610 }
15611 }
15612
15613 // Try to use SHUF128 if possible.
15614 if (Subtarget.hasVLX()) {
15615 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15616 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15617 ((WidenedMask[1] % 2) << 1);
15618 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15619 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15620 }
15621 }
15622 }
15623
15624 // Otherwise form a 128-bit permutation. After accounting for undefs,
15625 // convert the 64-bit shuffle mask selection values into 128-bit
15626 // selection bits by dividing the indexes by 2 and shifting into positions
15627 // defined by a vperm2*128 instruction's immediate control byte.
15628
15629 // The immediate permute control byte looks like this:
15630 // [1:0] - select 128 bits from sources for low half of destination
15631 // [2] - ignore
15632 // [3] - zero low half of destination
15633 // [5:4] - select 128 bits from sources for high half of destination
15634 // [6] - ignore
15635 // [7] - zero high half of destination
15636
15637 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15638 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15639
15640 unsigned PermMask = 0;
15641 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15642 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15643
15644 // Check the immediate mask and replace unused sources with undef.
15645 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15646 V1 = DAG.getUNDEF(VT);
15647 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15648 V2 = DAG.getUNDEF(VT);
15649
15650 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15651 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15652}
15653
15654/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15655/// shuffling each lane.
15656///
15657/// This attempts to create a repeated lane shuffle where each lane uses one
15658/// or two of the lanes of the inputs. The lanes of the input vectors are
15659/// shuffled in one or two independent shuffles to get the lanes into the
15660/// position needed by the final shuffle.
15662 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15663 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15664 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15665
15666 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15667 return SDValue();
15668
15669 int NumElts = Mask.size();
15670 int NumLanes = VT.getSizeInBits() / 128;
15671 int NumLaneElts = 128 / VT.getScalarSizeInBits();
  // Per-lane mask every destination lane must agree on, and the (up to two)
  // source lanes feeding each destination lane.
15672 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15673 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15674
15675 // First pass will try to fill in the RepeatMask from lanes that need two
15676 // sources.
15677 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15678 int Srcs[2] = {-1, -1};
15679 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15680 for (int i = 0; i != NumLaneElts; ++i) {
15681 int M = Mask[(Lane * NumLaneElts) + i];
15682 if (M < 0)
15683 continue;
15684 // Determine which of the possible input lanes (NumLanes from each source)
15685 // this element comes from. Assign that as one of the sources for this
15686 // lane. We can assign up to 2 sources for this lane. If we run out
15687 // sources we can't do anything.
15688 int LaneSrc = M / NumLaneElts;
15689 int Src;
15690 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15691 Src = 0;
15692 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15693 Src = 1;
15694 else
15695 return SDValue();
15696
15697 Srcs[Src] = LaneSrc;
15698 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15699 }
15700
15701 // If this lane has two sources, see if it fits with the repeat mask so far.
15702 if (Srcs[1] < 0)
15703 continue;
15704
15705 LaneSrcs[Lane][0] = Srcs[0];
15706 LaneSrcs[Lane][1] = Srcs[1];
15707
    // True if the masks agree on every element both define.
15708 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15709 assert(M1.size() == M2.size() && "Unexpected mask size");
15710 for (int i = 0, e = M1.size(); i != e; ++i)
15711 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15712 return false;
15713 return true;
15714 };
15715
    // Copy every defined element of Mask into MergedMask (must not conflict).
15716 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15717 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15718 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15719 int M = Mask[i];
15720 if (M < 0)
15721 continue;
15722 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15723 "Unexpected mask element");
15724 MergedMask[i] = M;
15725 }
15726 };
15727
15728 if (MatchMasks(InLaneMask, RepeatMask)) {
15729 // Merge this lane mask into the final repeat mask.
15730 MergeMasks(InLaneMask, RepeatMask);
15731 continue;
15732 }
15733
15734 // Didn't find a match. Swap the operands and try again.
15735 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15737
15738 if (MatchMasks(InLaneMask, RepeatMask)) {
15739 // Merge this lane mask into the final repeat mask.
15740 MergeMasks(InLaneMask, RepeatMask);
15741 continue;
15742 }
15743
15744 // Couldn't find a match with the operands in either order.
15745 return SDValue();
15746 }
15747
15748 // Now handle any lanes with only one source.
15749 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15750 // If this lane has already been processed, skip it.
15751 if (LaneSrcs[Lane][0] >= 0)
15752 continue;
15753
15754 for (int i = 0; i != NumLaneElts; ++i) {
15755 int M = Mask[(Lane * NumLaneElts) + i];
15756 if (M < 0)
15757 continue;
15758
15759 // If RepeatMask isn't defined yet we can define it ourself.
15760 if (RepeatMask[i] < 0)
15761 RepeatMask[i] = M % NumLaneElts;
15762
15763 if (RepeatMask[i] < NumElts) {
15764 if (RepeatMask[i] != M % NumLaneElts)
15765 return SDValue();
15766 LaneSrcs[Lane][0] = M / NumLaneElts;
15767 } else {
15768 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15769 return SDValue();
15770 LaneSrcs[Lane][1] = M / NumLaneElts;
15771 }
15772 }
15773
15774 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15775 return SDValue();
15776 }
15777
  // Build the lane shuffle that gathers each lane's first source into NewV1.
15778 SmallVector<int, 16> NewMask(NumElts, -1);
15779 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15780 int Src = LaneSrcs[Lane][0];
15781 for (int i = 0; i != NumLaneElts; ++i) {
15782 int M = -1;
15783 if (Src >= 0)
15784 M = Src * NumLaneElts + i;
15785 NewMask[Lane * NumLaneElts + i] = M;
15786 }
15787 }
15788 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15789 // Ensure we didn't get back the shuffle we started with.
15790 // FIXME: This is a hack to make up for some splat handling code in
15791 // getVectorShuffle.
15792 if (isa<ShuffleVectorSDNode>(NewV1) &&
15793 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15794 return SDValue();
15795
  // Same again for each lane's second source into NewV2.
15796 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15797 int Src = LaneSrcs[Lane][1];
15798 for (int i = 0; i != NumLaneElts; ++i) {
15799 int M = -1;
15800 if (Src >= 0)
15801 M = Src * NumLaneElts + i;
15802 NewMask[Lane * NumLaneElts + i] = M;
15803 }
15804 }
15805 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15806 // Ensure we didn't get back the shuffle we started with.
15807 // FIXME: This is a hack to make up for some splat handling code in
15808 // getVectorShuffle.
15809 if (isa<ShuffleVectorSDNode>(NewV2) &&
15810 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15811 return SDValue();
15812
  // Finally apply the agreed repeating mask across every lane of the two
  // lane-gathered inputs.
15813 for (int i = 0; i != NumElts; ++i) {
15814 if (Mask[i] < 0) {
15815 NewMask[i] = -1;
15816 continue;
15817 }
15818 NewMask[i] = RepeatMask[i % NumLaneElts];
15819 if (NewMask[i] < 0)
15820 continue;
15821
15822 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15823 }
15824 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15825}
15826
15827/// If the input shuffle mask results in a vector that is undefined in all upper
15828/// or lower half elements and that mask accesses only 2 halves of the
15829/// shuffle's operands, return true. A mask of half the width with mask indexes
15830/// adjusted to access the extracted halves of the original shuffle operands is
15831/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15832/// lower half of each input operand is accessed.
15833static bool
15835 int &HalfIdx1, int &HalfIdx2) {
15836 assert((Mask.size() == HalfMask.size() * 2) &&
15837 "Expected input mask to be twice as long as output");
15838
15839 // Exactly one half of the result must be undef to allow narrowing.
15840 bool UndefLower = isUndefLowerHalf(Mask);
15841 bool UndefUpper = isUndefUpperHalf(Mask);
15842 if (UndefLower == UndefUpper)
15843 return false;
15844
15845 unsigned HalfNumElts = HalfMask.size();
  // Only the defined half of Mask is examined; skip over the undef half.
15846 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15847 HalfIdx1 = -1;
15848 HalfIdx2 = -1;
15849 for (unsigned i = 0; i != HalfNumElts; ++i) {
15850 int M = Mask[i + MaskIndexOffset];
15851 if (M < 0) {
15852 HalfMask[i] = M;
15853 continue;
15854 }
15855
15856 // Determine which of the 4 half vectors this element is from.
15857 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15858 int HalfIdx = M / HalfNumElts;
15859
15860 // Determine the element index into its half vector source.
15861 int HalfElt = M % HalfNumElts;
15862
15863 // We can shuffle with up to 2 half vectors, set the new 'half'
15864 // shuffle mask accordingly.
15865 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15866 HalfMask[i] = HalfElt;
15867 HalfIdx1 = HalfIdx;
15868 continue;
15869 }
15870 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15871 HalfMask[i] = HalfElt + HalfNumElts;
15872 HalfIdx2 = HalfIdx;
15873 continue;
15874 }
15875
15876 // Too many half vectors referenced.
15877 return false;
15878 }
15879
15880 return true;
15881}
15882
15883/// Given the output values from getHalfShuffleMask(), create a half width
15884/// shuffle of extracted vectors followed by an insert back to full width.
15886 ArrayRef<int> HalfMask, int HalfIdx1,
15887 int HalfIdx2, bool UndefLower,
15888 SelectionDAG &DAG, bool UseConcat = false) {
15889 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15890 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15891
15892 MVT VT = V1.getSimpleValueType();
15893 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15894 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15895
  // Extract the half selected by HalfIdx (0=LoV1, 1=HiV1, 2=LoV2, 3=HiV2 per
  // getHalfShuffleMask's encoding); negative means undef.
15896 auto getHalfVector = [&](int HalfIdx) {
15897 if (HalfIdx < 0)
15898 return DAG.getUNDEF(HalfVT);
15899 SDValue V = (HalfIdx < 2 ? V1 : V2);
15900 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15901 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15902 DAG.getVectorIdxConstant(HalfIdx, DL));
15903 };
15904
15905 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15906 SDValue Half1 = getHalfVector(HalfIdx1);
15907 SDValue Half2 = getHalfVector(HalfIdx2);
15908 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15909 if (UseConcat) {
15910 SDValue Op0 = V;
15911 SDValue Op1 = DAG.getUNDEF(HalfVT);
15912 if (UndefLower)
15913 std::swap(Op0, Op1);
15914 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15915 }
15916
  // Insert the shuffled half into whichever half of the result is defined.
15917 unsigned Offset = UndefLower ? HalfNumElts : 0;
15918 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15920}
15921
15922/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15923/// This allows for fast cases such as subvector extraction/insertion
15924/// or shuffling smaller vector types which can lower more efficiently.
15926 SDValue V2, ArrayRef<int> Mask,
15927 const X86Subtarget &Subtarget,
15928 SelectionDAG &DAG) {
15929 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15930 "Expected 256-bit or 512-bit vector");
15931
15932 bool UndefLower = isUndefLowerHalf(Mask);
15933 if (!UndefLower && !isUndefUpperHalf(Mask))
15934 return SDValue();
15935
15936 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15937 "Completely undef shuffle mask should have been simplified already");
15938
15939 // Upper half is undef and lower half is whole upper subvector.
15940 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15941 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15942 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15943 if (!UndefLower &&
15944 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15945 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15946 DAG.getVectorIdxConstant(HalfNumElts, DL));
15947 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15948 DAG.getVectorIdxConstant(0, DL));
15949 }
15950
15951 // Lower half is undef and upper half is whole lower subvector.
15952 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15953 if (UndefLower &&
15954 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15955 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15956 DAG.getVectorIdxConstant(0, DL));
15957 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15958 DAG.getVectorIdxConstant(HalfNumElts, DL));
15959 }
15960
15961 int HalfIdx1, HalfIdx2;
15962 SmallVector<int, 8> HalfMask(HalfNumElts);
15963 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15964 return SDValue();
15965
15966 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15967
15968 // Only shuffle the halves of the inputs when useful.
  // Half indices encode 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2
  // (see getHalfShuffleMask), so even = lower halves, odd = upper halves.
15969 unsigned NumLowerHalves =
15970 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15971 unsigned NumUpperHalves =
15972 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15973 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15974
15975 // Determine the larger pattern of undef/halves, then decide if it's worth
15976 // splitting the shuffle based on subtarget capabilities and types.
15977 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15978 if (!UndefLower) {
15979 // XXXXuuuu: no insert is needed.
15980 // Always extract lowers when setting lower - these are all free subreg ops.
15981 if (NumUpperHalves == 0)
15982 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15983 UndefLower, DAG);
15984
15985 if (NumUpperHalves == 1) {
15986 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15987 if (Subtarget.hasAVX2()) {
15988 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15989 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15990 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15991 (!isSingleSHUFPSMask(HalfMask) ||
15992 Subtarget.hasFastVariableCrossLaneShuffle()))
15993 return SDValue();
15994 // If this is an unary shuffle (assume that the 2nd operand is
15995 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15996 // are better off extracting the upper half of 1 operand and using a
15997 // narrow shuffle.
15998 if (EltWidth == 64 && V2.isUndef())
15999 return SDValue();
16000 // If this is an unary vXi8 shuffle with inplace halves, then perform as
16001 // full width pshufb, and then merge.
16002 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16003 return SDValue();
16004 }
16005 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16006 if (Subtarget.hasAVX512() && VT.is512BitVector())
16007 return SDValue();
16008 // Extract + narrow shuffle is better than the wide alternative.
16009 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16010 UndefLower, DAG);
16011 }
16012
16013 // Don't extract both uppers, instead shuffle and then extract.
16014 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16015 return SDValue();
16016 }
16017
16018 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16019 if (NumUpperHalves == 0) {
16020 // AVX2 has efficient 64-bit element cross-lane shuffles.
16021 // TODO: Refine to account for unary shuffle, splat, and other masks?
16022 if (Subtarget.hasAVX2() && EltWidth == 64)
16023 return SDValue();
16024 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16025 if (Subtarget.hasAVX512() && VT.is512BitVector())
16026 return SDValue();
16027 // Narrow shuffle + insert is better than the wide alternative.
16028 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16029 UndefLower, DAG);
16030 }
16031
16032 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16033 return SDValue();
16034}
16035
16036/// Handle case where shuffle sources are coming from the same 128-bit lane and
16037/// every lane can be represented as the same repeating mask - allowing us to
16038/// shuffle the sources with the repeating shuffle and then permute the result
16039/// to the destination lanes.
16041 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16042 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16043 int NumElts = VT.getVectorNumElements();
16044 int NumLanes = VT.getSizeInBits() / 128;
16045 int NumLaneElts = NumElts / NumLanes;
16046
16047 // On AVX2 we may be able to just shuffle the lowest elements and then
16048 // broadcast the result.
16049 if (Subtarget.hasAVX2()) {
16050 for (unsigned BroadcastSize : {16, 32, 64}) {
16051 if (BroadcastSize <= VT.getScalarSizeInBits())
16052 continue;
16053 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16054
16055 // Attempt to match a repeating pattern every NumBroadcastElts,
16056 // accounting for UNDEFs but only references the lowest 128-bit
16057 // lane of the inputs.
16058 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16059 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16060 for (int j = 0; j != NumBroadcastElts; ++j) {
16061 int M = Mask[i + j];
16062 if (M < 0)
16063 continue;
16064 int &R = RepeatMask[j];
// (M % NumElts) folds a V2 reference into the 0..NumElts-1 range, so this
// rejects any element sourced outside the lowest 128-bit lane of its input.
16065 if (0 != ((M % NumElts) / NumLaneElts))
16066 return false;
16067 if (0 <= R && R != M)
16068 return false;
16069 R = M;
16070 }
16071 return true;
16072 };
16073
16074 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16075 if (!FindRepeatingBroadcastMask(RepeatMask))
16076 continue;
16077
16078 // Shuffle the (lowest) repeated elements in place for broadcast.
16079 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16080
16081 // Shuffle the actual broadcast.
16082 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16083 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16084 for (int j = 0; j != NumBroadcastElts; ++j)
16085 BroadcastMask[i + j] = j;
16086
16087 // Avoid returning the same shuffle operation. For example,
16088 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16089 if (BroadcastMask == Mask)
16090 return SDValue();
16091
16092 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16093 BroadcastMask);
16094 }
16095 }
16096
16097 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16098 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16099 return SDValue();
16100
16101 // Bail if we already have a repeated lane shuffle mask.
16102 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16103 return SDValue();
16104
16105 // Helper to look for repeated mask in each split sublane, and that those
16106 // sublanes can then be permuted into place.
16107 auto ShuffleSubLanes = [&](int SubLaneScale) {
16108 int NumSubLanes = NumLanes * SubLaneScale;
16109 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16110
16111 // Check that all the sources are coming from the same lane and see if we
16112 // can form a repeating shuffle mask (local to each sub-lane). At the same
16113 // time, determine the source sub-lane for each destination sub-lane.
16114 int TopSrcSubLane = -1;
16115 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16116 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16117 SubLaneScale,
16118 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16119
16120 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16121 // Extract the sub-lane mask, check that it all comes from the same lane
16122 // and normalize the mask entries to come from the first lane.
16123 int SrcLane = -1;
16124 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16125 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16126 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16127 if (M < 0)
16128 continue;
16129 int Lane = (M % NumElts) / NumLaneElts;
16130 if ((0 <= SrcLane) && (SrcLane != Lane))
16131 return SDValue();
16132 SrcLane = Lane;
// Normalize to a first-lane index while preserving which input it came
// from: 0..NumLaneElts-1 refers to V1, NumElts.. refers to V2.
16133 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16134 SubLaneMask[Elt] = LocalM;
16135 }
16136
16137 // Whole sub-lane is UNDEF.
16138 if (SrcLane < 0)
16139 continue;
16140
16141 // Attempt to match against the candidate repeated sub-lane masks.
16142 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
// Two sub-lane masks are compatible if they agree on every element that
// is defined in both (UNDEFs match anything).
16143 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16144 for (int i = 0; i != NumSubLaneElts; ++i) {
16145 if (M1[i] < 0 || M2[i] < 0)
16146 continue;
16147 if (M1[i] != M2[i])
16148 return false;
16149 }
16150 return true;
16151 };
16152
16153 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16154 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16155 continue;
16156
16157 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16158 for (int i = 0; i != NumSubLaneElts; ++i) {
16159 int M = SubLaneMask[i];
16160 if (M < 0)
16161 continue;
16162 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16163 "Unexpected mask element");
16164 RepeatedSubLaneMask[i] = M;
16165 }
16166
16167 // Track the top most source sub-lane - by setting the remaining to
16168 // UNDEF we can greatly simplify shuffle matching.
16169 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16170 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16171 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16172 break;
16173 }
16174
16175 // Bail if we failed to find a matching repeated sub-lane mask.
16176 if (Dst2SrcSubLanes[DstSubLane] < 0)
16177 return SDValue();
16178 }
16179 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16180 "Unexpected source lane");
16181
16182 // Create a repeating shuffle mask for the entire vector.
16183 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16184 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16185 int Lane = SubLane / SubLaneScale;
16186 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16187 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16188 int M = RepeatedSubLaneMask[Elt];
16189 if (M < 0)
16190 continue;
16191 int Idx = (SubLane * NumSubLaneElts) + Elt;
16192 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16193 }
16194 }
16195
16196 // Shuffle each source sub-lane to its destination.
16197 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16198 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16199 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16200 if (SrcSubLane < 0)
16201 continue;
16202 for (int j = 0; j != NumSubLaneElts; ++j)
16203 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16204 }
16205
16206 // Avoid returning the same shuffle operation.
16207 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16208 if (RepeatedMask == Mask || SubLaneMask == Mask)
16209 return SDValue();
16210
16211 SDValue RepeatedShuffle =
16212 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16213
16214 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16215 SubLaneMask);
16216 };
16217
16218 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16219 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16220 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16221 // Otherwise we can only permute whole 128-bit lanes.
16222 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16223 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16224 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16225 MinSubLaneScale = 2;
16226 MaxSubLaneScale =
16227 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16228 }
16229 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16230 MinSubLaneScale = MaxSubLaneScale = 4;
16231
// Try each candidate sub-lane granularity in turn until one succeeds.
16232 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16233 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16234 return Shuffle;
16235
16236 return SDValue();
16237}
16238
16240 bool &ForceV1Zero, bool &ForceV2Zero,
16241 unsigned &ShuffleImm, ArrayRef<int> Mask,
16242 const APInt &Zeroable) {
16243 int NumElts = VT.getVectorNumElements();
16244 assert(VT.getScalarSizeInBits() == 64 &&
16245 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16246 "Unexpected data type for VSHUFPD");
16247 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16248 "Illegal shuffle mask");
16249
// ZeroLane[0] / ZeroLane[1] - whether every even / odd destination element
// is zeroable; a fully-zeroable half lets us substitute a zero vector for
// the corresponding source operand.
16250 bool ZeroLane[2] = { true, true };
16251 for (int i = 0; i < NumElts; ++i)
16252 ZeroLane[i & 1] &= Zeroable[i];
16253
16254 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16255 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
16256 bool IsSHUFPD = true;
16257 bool IsCommutable = true;
16258 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16259 for (int i = 0; i < NumElts; ++i) {
16260 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16261 continue;
16262 if (Mask[i] < 0)
16263 return false;
// Val - the allowed element pair when even destination elements come from
// V1 and odd ones from V2 (+NumElts); CommutVal - the same pair with the
// operands swapped. Each destination element may pick either entry of its
// 128-bit pair, hence the Val / Val+1 range check below.
16264 int Val = (i & 6) + NumElts * (i & 1);
16265 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16266 if (Mask[i] < Val || Mask[i] > Val + 1)
16267 IsSHUFPD = false;
16268 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16269 IsCommutable = false;
16270 SHUFPDMask[i] = Mask[i] % 2;
16271 }
16272
16273 if (!IsSHUFPD && !IsCommutable)
16274 return false;
16275
// Only the commuted form matched - swap the operands so the caller emits
// the SHUFP node with them in the matching order.
16276 if (!IsSHUFPD && IsCommutable)
16277 std::swap(V1, V2);
16278
16279 ForceV1Zero = ZeroLane[0];
16280 ForceV2Zero = ZeroLane[1];
16281 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16282 return true;
16283}
16284
16286 SDValue V2, ArrayRef<int> Mask,
16287 const APInt &Zeroable,
16288 const X86Subtarget &Subtarget,
16289 SelectionDAG &DAG) {
16290 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16291 "Unexpected data type for VSHUFPD");
16292
// Attempt to match the mask to a single SHUFPD; the matcher may commute
// V1/V2 and reports which operands must be forced to zero.
16293 unsigned Immediate = 0;
16294 bool ForceV1Zero = false, ForceV2Zero = false;
16295 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16296 Mask, Zeroable))
16297 return SDValue();
16298
16299 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16300 if (ForceV1Zero)
16301 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16302 if (ForceV2Zero)
16303 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16304
16305 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16306 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16307}
16308
16309// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16310// by zeroable elements in the remaining 24 elements. Turn this into two
16311// vmovqb instructions shuffled together.
16313 SDValue V1, SDValue V2,
16314 ArrayRef<int> Mask,
16315 const APInt &Zeroable,
16316 SelectionDAG &DAG) {
16317 assert(VT == MVT::v32i8 && "Unexpected type!");
16318
16319 // The first 8 indices should be every 8th element.
16320 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16321 return SDValue();
16322
16323 // Remaining elements need to be zeroable.
16324 if (Zeroable.countl_one() < (Mask.size() - 8))
16325 return SDValue();
16326
// View each input as v4i64 and truncate each 64-bit element to a byte,
// giving the four wanted bytes in the low part of each v16i8 result.
16327 V1 = DAG.getBitcast(MVT::v4i64, V1);
16328 V2 = DAG.getBitcast(MVT::v4i64, V2);
16329
16330 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16331 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16332
16333 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16334 // the upper bits of the result using an unpckldq.
16335 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16336 { 0, 1, 2, 3, 16, 17, 18, 19,
16337 4, 5, 6, 7, 20, 21, 22, 23 });
16338 // Insert the unpckldq into a zero vector to widen to v32i8.
16339 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16340 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16341 DAG.getVectorIdxConstant(0, DL));
16342}
16343
16344// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16345// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16346// =>
16347// ul = unpckl v1, v2
16348// uh = unpckh v1, v2
16349// a = vperm ul, uh
16350// b = vperm ul, uh
16351//
16352// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16353// and permute. We cannot directly match v3 because it is split into two
16354// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16355// pair of 256-bit shuffles and makes sure the masks are consecutive.
16356//
16357// Once unpck and permute nodes are created, the permute corresponding to this
16358// shuffle is returned, while the other permute replaces the other half of the
16359// shuffle in the selection dag.
16361 SDValue V1, SDValue V2,
16362 ArrayRef<int> Mask,
16363 SelectionDAG &DAG) {
16364 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16365 VT != MVT::v32i8)
16366 return SDValue();
16367 // <B0, B1, B0+1, B1+1, ..., >
16368 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16369 unsigned Begin1) {
16370 size_t Size = Mask.size();
16371 assert(Size % 2 == 0 && "Expected even mask size");
16372 for (unsigned I = 0; I < Size; I += 2) {
16373 if (Mask[I] != (int)(Begin0 + I / 2) ||
16374 Mask[I + 1] != (int)(Begin1 + I / 2))
16375 return false;
16376 }
16377 return true;
16378 };
16379 // Check which half is this shuffle node
16380 int NumElts = VT.getVectorNumElements();
16381 size_t FirstQtr = NumElts / 2;
16382 size_t ThirdQtr = NumElts + NumElts / 2;
16383 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16384 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16385 if (!IsFirstHalf && !IsSecondHalf)
16386 return SDValue();
16387
16388 // Find the intersection between shuffle users of V1 and V2.
// Only shuffles with operands in exactly the (V1, V2) order qualify - the
// pairing below relies on both halves reading the same two sources.
16389 SmallVector<SDNode *, 2> Shuffles;
16390 for (SDNode *User : V1->users())
16391 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16392 User->getOperand(1) == V2)
16393 Shuffles.push_back(User);
16394 // Limit user size to two for now.
16395 if (Shuffles.size() != 2)
16396 return SDValue();
16397 // Find out which half of the 512-bit shuffles is each smaller shuffle
16398 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16399 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16400 SDNode *FirstHalf;
16401 SDNode *SecondHalf;
16402 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16403 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16404 FirstHalf = Shuffles[0];
16405 SecondHalf = Shuffles[1];
16406 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16407 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16408 FirstHalf = Shuffles[1];
16409 SecondHalf = Shuffles[0];
16410 } else {
16411 return SDValue();
16412 }
16413 // Lower into unpck and perm. Return the perm of this shuffle and replace
16414 // the other.
16415 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16416 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// VPERM2X128 imm 0x20 concatenates the low 128-bit halves of the two
// sources, imm 0x31 the high halves (per the VPERM2F128 imm8 encoding).
16417 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16418 DAG.getTargetConstant(0x20, DL, MVT::i8));
16419 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16420 DAG.getTargetConstant(0x31, DL, MVT::i8));
// Note: this rewrites the *other* half's node in the DAG as a side effect.
16421 if (IsFirstHalf) {
16422 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16423 return Perm1;
16424 }
16425 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16426 return Perm2;
16427}
16428
16429/// Handle lowering of 4-lane 64-bit floating point shuffles.
16430///
16431/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16432/// isn't available.
16434 const APInt &Zeroable, SDValue V1, SDValue V2,
16435 const X86Subtarget &Subtarget,
16436 SelectionDAG &DAG) {
16437 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16438 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16439 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16440
// Strategies are tried cheapest-first; each helper returns a null SDValue
// when it cannot profitably handle the mask.
16441 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16442 Subtarget, DAG))
16443 return V;
16444
16445 if (V2.isUndef()) {
16446 // Check for being able to broadcast a single element.
16447 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16448 Mask, Subtarget, DAG))
16449 return Broadcast;
16450
16451 // Use low duplicate instructions for masks that match their pattern.
16452 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16453 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16454
16455 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16456 // Non-half-crossing single input shuffles can be lowered with an
16457 // interleaved permutation.
// Imm bit i is set iff destination element i takes the high element of
// its 128-bit pair (mask value 1 or 3).
16458 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16459 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16460 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16461 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16462 }
16463
16464 // With AVX2 we have direct support for this permutation.
16465 if (Subtarget.hasAVX2())
16466 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16467 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16468
16469 // Try to create an in-lane repeating shuffle mask and then shuffle the
16470 // results into the target lanes.
16472 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16473 return V;
16474
16475 // Try to permute the lanes and then use a per-lane permute.
16476 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16477 Mask, DAG, Subtarget))
16478 return V;
16479
16480 // Otherwise, fall back.
16481 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16482 DAG, Subtarget);
16483 }
16484
16485 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16486 Zeroable, Subtarget, DAG))
16487 return Blend;
16488
16489 // Use dedicated unpack instructions for masks that match their pattern.
16490 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16491 return V;
16492
16493 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16494 Zeroable, Subtarget, DAG))
16495 return Op;
16496
16497 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16498 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16499 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16500 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16501
16502 // If we have lane crossing shuffles AND they don't all come from the lower
16503 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16504 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16505 // canonicalize to a blend of splat which isn't necessary for this combine.
16506 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16507 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16508 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16509 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16510 (!Subtarget.hasAVX2() ||
16511 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16512 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16513
16514 // If we have one input in place, then we can permute the other input and
16515 // blend the result.
16516 if (V1IsInPlace || V2IsInPlace)
16517 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16518 Zeroable, Subtarget, DAG);
16519
16520 // Try to create an in-lane repeating shuffle mask and then shuffle the
16521 // results into the target lanes.
16523 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16524 return V;
16525
16526 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16527 // shuffle. However, if we have AVX2 and either inputs are already in place,
16528 // we will be able to shuffle even across lanes the other input in a single
16529 // instruction so skip this pattern.
16530 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16532 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16533 return V;
16534
16535 // If we have VLX support, we can use VEXPAND.
16536 if (Subtarget.hasVLX())
16537 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16538 Zeroable, Subtarget, DAG))
16539 return V;
16540
16541 // If we have AVX2 then we always want to lower with a blend because an v4 we
16542 // can fully permute the elements.
16543 if (Subtarget.hasAVX2())
16544 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16545 Zeroable, Subtarget, DAG);
16546
16547 // Otherwise fall back on generic lowering.
16548 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16549 Subtarget, DAG);
16550}
16551
16552/// Handle lowering of 4-lane 64-bit integer shuffles.
16553///
16554/// This routine is only called when we have AVX2 and thus a reasonable
16555/// instruction set for v4i64 shuffling..
16557 const APInt &Zeroable, SDValue V1, SDValue V2,
16558 const X86Subtarget &Subtarget,
16559 SelectionDAG &DAG) {
16560 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16561 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16562 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16563 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16564
// Strategies are tried cheapest-first; each helper returns a null SDValue
// when it cannot profitably handle the mask.
16565 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16566 Subtarget, DAG))
16567 return V;
16568
16569 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16570 Zeroable, Subtarget, DAG))
16571 return Blend;
16572
16573 // Check for being able to broadcast a single element.
16574 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16575 Subtarget, DAG))
16576 return Broadcast;
16577
16578 // Try to use shift instructions if fast.
16579 if (Subtarget.preferLowerShuffleAsShift())
16580 if (SDValue Shift =
16581 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16582 Subtarget, DAG, /*BitwiseOnly*/ true))
16583 return Shift;
16584
16585 if (V2.isUndef()) {
16586 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16587 // can use lower latency instructions that will operate on both lanes.
16588 SmallVector<int, 2> RepeatedMask;
16589 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// Widen the repeated 2 x 64-bit mask into a 4 x 32-bit mask so a single
// PSHUFD (on a v8i32 bitcast) covers both lanes.
16590 SmallVector<int, 4> PSHUFDMask;
16591 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16592 return DAG.getBitcast(
16593 MVT::v4i64,
16594 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16595 DAG.getBitcast(MVT::v8i32, V1),
16596 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16597 }
16598
16599 // AVX2 provides a direct instruction for permuting a single input across
16600 // lanes.
16601 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16602 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16603 }
16604
16605 // Try to use shift instructions.
16606 if (SDValue Shift =
16607 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16608 DAG, /*BitwiseOnly*/ false))
16609 return Shift;
16610
16611 // If we have VLX support, we can use VALIGN or VEXPAND.
16612 if (Subtarget.hasVLX()) {
16613 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16614 Zeroable, Subtarget, DAG))
16615 return Rotate;
16616
16617 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16618 Zeroable, Subtarget, DAG))
16619 return V;
16620 }
16621
16622 // Try to use PALIGNR.
16623 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16624 Subtarget, DAG))
16625 return Rotate;
16626
16627 // Use dedicated unpack instructions for masks that match their pattern.
16628 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16629 return V;
16630
16631 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16632 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16633
16634 // If we have one input in place, then we can permute the other input and
16635 // blend the result.
16636 if (V1IsInPlace || V2IsInPlace)
16637 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16638 Zeroable, Subtarget, DAG);
16639
16640 // Try to create an in-lane repeating shuffle mask and then shuffle the
16641 // results into the target lanes.
16643 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16644 return V;
16645
16646 // Try to lower to PERMQ(BLENDD(V1,V2)).
16647 if (SDValue V =
16648 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16649 return V;
16650
16651 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16652 // shuffle. However, if we have AVX2 and either inputs are already in place,
16653 // we will be able to shuffle even across lanes the other input in a single
16654 // instruction so skip this pattern.
16655 if (!V1IsInPlace && !V2IsInPlace)
16657 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16658 return Result;
16659
16660 // Otherwise fall back on generic blend lowering.
16661 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16662 Zeroable, Subtarget, DAG);
16663}
16664
16665/// Handle lowering of 8-lane 32-bit floating point shuffles.
16666///
16667/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16668/// isn't available.
16670 const APInt &Zeroable, SDValue V1, SDValue V2,
16671 const X86Subtarget &Subtarget,
16672 SelectionDAG &DAG) {
16673 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16674 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16675 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16676
// Strategies are tried cheapest-first; each helper returns a null SDValue
// when it cannot profitably handle the mask.
16677 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16678 Zeroable, Subtarget, DAG))
16679 return Blend;
16680
16681 // Check for being able to broadcast a single element.
16682 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16683 Subtarget, DAG))
16684 return Broadcast;
16685
// Without AVX2, prefer a simple 128-bit split when even the in-lane part of
// the mask doesn't repeat across lanes.
16686 if (!Subtarget.hasAVX2()) {
16687 SmallVector<int> InLaneMask;
16688 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16689
16690 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16691 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16692 /*SimpleOnly*/ true))
16693 return R;
16694 }
16695 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16696 Zeroable, Subtarget, DAG))
16697 return DAG.getBitcast(MVT::v8f32, ZExt);
16698
16699 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16700 // options to efficiently lower the shuffle.
16701 SmallVector<int, 4> RepeatedMask;
16702 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16703 assert(RepeatedMask.size() == 4 &&
16704 "Repeated masks must be half the mask width!");
16705
16706 // Use even/odd duplicate instructions for masks that match their pattern.
16707 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16708 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16709 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16710 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16711
16712 if (V2.isUndef())
16713 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16714 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16715
16716 // Use dedicated unpack instructions for masks that match their pattern.
16717 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16718 return V;
16719
16720 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16721 // have already handled any direct blends.
16722 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16723 }
16724
16725 // Try to create an in-lane repeating shuffle mask and then shuffle the
16726 // results into the target lanes.
16728 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16729 return V;
16730
16731 // If we have a single input shuffle with different shuffle patterns in the
16732 // two 128-bit lanes use the variable mask to VPERMILPS.
16733 if (V2.isUndef()) {
16734 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16735 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16736 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16737 }
16738 if (Subtarget.hasAVX2()) {
16739 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16740 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16741 }
16742 // Otherwise, fall back.
16743 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16744 DAG, Subtarget);
16745 }
16746
16747 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16748 // shuffle.
16750 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16751 return Result;
16752
16753 // If we have VLX support, we can use VEXPAND.
16754 if (Subtarget.hasVLX())
16755 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16756 Zeroable, Subtarget, DAG))
16757 return V;
16758
16759 // Try to match an interleave of two v8f32s and lower them as unpck and
16760 // permutes using ymms. This needs to go before we try to split the vectors.
16761 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16762 if ((Subtarget.hasAVX2() ||
16765 !Subtarget.hasAVX512())
16766 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16767 Mask, DAG))
16768 return V;
16769
16770 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
16771 // since after split we get a more efficient code using vpunpcklwd and
16772 // vpunpckhwd instrs than vblend.
16773 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16774 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16775 Subtarget, DAG);
16776
16777 // If we have AVX2 then we always want to lower with a blend because at v8 we
16778 // can fully permute the elements.
16779 if (Subtarget.hasAVX2())
16780 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16781 Zeroable, Subtarget, DAG);
16782
16783 // Otherwise fall back on generic lowering.
16784 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16785 Subtarget, DAG);
16786}
16787
16788/// Handle lowering of 8-lane 32-bit integer shuffles.
16789///
16790/// This routine is only called when we have AVX2 and thus a reasonable
16791/// instruction set for v8i32 shuffling..
16793 const APInt &Zeroable, SDValue V1, SDValue V2,
16794 const X86Subtarget &Subtarget,
16795 SelectionDAG &DAG) {
16796 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16797 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16798 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16799 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16800
16801 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16802
16803 // Whenever we can lower this as a zext, that instruction is strictly faster
16804 // than any alternative. It also allows us to fold memory operands into the
16805 // shuffle in many cases.
16806 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16807 Zeroable, Subtarget, DAG))
16808 return ZExt;
16809
16810 // Try to match an interleave of two v8i32s and lower them as unpck and
16811 // permutes using ymms. This needs to go before we try to split the vectors.
16812 if (!Subtarget.hasAVX512())
16813 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16814 Mask, DAG))
16815 return V;
16816
16817 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
16818 // since after split we get a more efficient code than vblend by using
16819 // vpunpcklwd and vpunpckhwd instrs.
16820 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16821 !Subtarget.hasAVX512())
16822 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16823 Subtarget, DAG);
16824
16825 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16826 Zeroable, Subtarget, DAG))
16827 return Blend;
16828
16829 // Check for being able to broadcast a single element.
16830 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16831 Subtarget, DAG))
16832 return Broadcast;
16833
16834 // Try to use shift instructions if fast.
16835 if (Subtarget.preferLowerShuffleAsShift()) {
16836 if (SDValue Shift =
16837 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16838 Subtarget, DAG, /*BitwiseOnly*/ true))
16839 return Shift;
16840 if (NumV2Elements == 0)
16841 if (SDValue Rotate =
16842 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16843 return Rotate;
16844 }
16845
16846 // If the shuffle mask is repeated in each 128-bit lane we can use more
16847 // efficient instructions that mirror the shuffles across the two 128-bit
16848 // lanes.
16849 SmallVector<int, 4> RepeatedMask;
16850 bool Is128BitLaneRepeatedShuffle =
16851 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16852 if (Is128BitLaneRepeatedShuffle) {
16853 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16854 if (V2.isUndef())
16855 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16856 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16857
16858 // Use dedicated unpack instructions for masks that match their pattern.
16859 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16860 return V;
16861 }
16862
16863 // Try to use shift instructions.
16864 if (SDValue Shift =
16865 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16866 DAG, /*BitwiseOnly*/ false))
16867 return Shift;
16868
16869 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16870 if (SDValue Rotate =
16871 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16872 return Rotate;
16873
16874 // If we have VLX support, we can use VALIGN or EXPAND.
16875 if (Subtarget.hasVLX()) {
16876 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16877 Zeroable, Subtarget, DAG))
16878 return Rotate;
16879
16880 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16881 Zeroable, Subtarget, DAG))
16882 return V;
16883 }
16884
16885 // Try to use byte rotation instructions.
16886 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16887 Subtarget, DAG))
16888 return Rotate;
16889
16890 // Try to create an in-lane repeating shuffle mask and then shuffle the
16891 // results into the target lanes.
16893 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16894 return V;
16895
16896 if (V2.isUndef()) {
16897 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16898 // because that should be faster than the variable permute alternatives.
16899 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16900 return V;
16901
16902 // If the shuffle patterns aren't repeated but it's a single input, directly
16903 // generate a cross-lane VPERMD instruction.
16904 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16905 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16906 }
16907
16908 // Assume that a single SHUFPS is faster than an alternative sequence of
16909 // multiple instructions (even if the CPU has a domain penalty).
16910 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16911 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16912 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16913 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16914 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16915 CastV1, CastV2, DAG);
16916 return DAG.getBitcast(MVT::v8i32, ShufPS);
16917 }
16918
16919 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16920 // shuffle.
16922 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16923 return Result;
16924
16925 // Otherwise fall back on generic blend lowering.
16926 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16927 Zeroable, Subtarget, DAG);
16928}
16929
16930/// Handle lowering of 16-lane 16-bit integer shuffles.
16931///
16932/// This routine is only called when we have AVX2 and thus a reasonable
16933/// instruction set for v16i16 shuffling..
16935 const APInt &Zeroable, SDValue V1, SDValue V2,
16936 const X86Subtarget &Subtarget,
16937 SelectionDAG &DAG) {
16938 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16939 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16940 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16941 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16942
16943 // Whenever we can lower this as a zext, that instruction is strictly faster
16944 // than any alternative. It also allows us to fold memory operands into the
16945 // shuffle in many cases.
16947 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16948 return ZExt;
16949
16950 // Check for being able to broadcast a single element.
16951 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16952 Subtarget, DAG))
16953 return Broadcast;
16954
16955 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16956 Zeroable, Subtarget, DAG))
16957 return Blend;
16958
16959 // Use dedicated unpack instructions for masks that match their pattern.
16960 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16961 return V;
16962
16963 // Use dedicated pack instructions for masks that match their pattern.
16964 if (SDValue V =
16965 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16966 return V;
16967
16968 // Try to use lower using a truncation.
16969 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16970 Subtarget, DAG))
16971 return V;
16972
16973 // Try to use shift instructions.
16974 if (SDValue Shift =
16975 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16976 Subtarget, DAG, /*BitwiseOnly*/ false))
16977 return Shift;
16978
16979 // Try to use byte rotation instructions.
16980 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16981 Subtarget, DAG))
16982 return Rotate;
16983
16984 // Try to create an in-lane repeating shuffle mask and then shuffle the
16985 // results into the target lanes.
16987 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16988 return V;
16989
16990 if (V2.isUndef()) {
16991 // Try to use bit rotation instructions.
16992 if (SDValue Rotate =
16993 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16994 return Rotate;
16995
16996 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16997 // because that should be faster than the variable permute alternatives.
16998 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16999 return V;
17000
17001 // There are no generalized cross-lane shuffle operations available on i16
17002 // element types.
17003 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17005 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17006 return V;
17007
17008 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17009 DAG, Subtarget);
17010 }
17011
17012 SmallVector<int, 8> RepeatedMask;
17013 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17014 // As this is a single-input shuffle, the repeated mask should be
17015 // a strictly valid v8i16 mask that we can pass through to the v8i16
17016 // lowering to handle even the v16 case.
17018 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17019 }
17020 }
17021
17022 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17023 Zeroable, Subtarget, DAG))
17024 return PSHUFB;
17025
17026 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17027 if (Subtarget.hasBWI())
17028 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17029
17030 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17031 // shuffle.
17033 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17034 return Result;
17035
17036 // Try to permute the lanes and then use a per-lane permute.
17038 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17039 return V;
17040
17041 // Try to match an interleave of two v16i16s and lower them as unpck and
17042 // permutes using ymms.
17043 if (!Subtarget.hasAVX512())
17044 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17045 Mask, DAG))
17046 return V;
17047
17048 // Otherwise fall back on generic lowering.
17049 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17050 Subtarget, DAG);
17051}
17052
17053/// Handle lowering of 32-lane 8-bit integer shuffles.
17054///
17055/// This routine is only called when we have AVX2 and thus a reasonable
17056/// instruction set for v32i8 shuffling..
17058 const APInt &Zeroable, SDValue V1, SDValue V2,
17059 const X86Subtarget &Subtarget,
17060 SelectionDAG &DAG) {
17061 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17062 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17063 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17064 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17065
17066 // Whenever we can lower this as a zext, that instruction is strictly faster
17067 // than any alternative. It also allows us to fold memory operands into the
17068 // shuffle in many cases.
17069 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17070 Zeroable, Subtarget, DAG))
17071 return ZExt;
17072
17073 // Check for being able to broadcast a single element.
17074 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17075 Subtarget, DAG))
17076 return Broadcast;
17077
17078 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17079 Zeroable, Subtarget, DAG))
17080 return Blend;
17081
17082 // Use dedicated unpack instructions for masks that match their pattern.
17083 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17084 return V;
17085
17086 // Use dedicated pack instructions for masks that match their pattern.
17087 if (SDValue V =
17088 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17089 return V;
17090
17091 // Try to use lower using a truncation.
17092 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17093 Subtarget, DAG))
17094 return V;
17095
17096 // Try to use shift instructions.
17097 if (SDValue Shift =
17098 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17099 DAG, /*BitwiseOnly*/ false))
17100 return Shift;
17101
17102 // Try to use byte rotation instructions.
17103 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17104 Subtarget, DAG))
17105 return Rotate;
17106
17107 // Try to use bit rotation instructions.
17108 if (V2.isUndef())
17109 if (SDValue Rotate =
17110 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17111 return Rotate;
17112
17113 // Try to create an in-lane repeating shuffle mask and then shuffle the
17114 // results into the target lanes.
17116 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17117 return V;
17118
17119 // There are no generalized cross-lane shuffle operations available on i8
17120 // element types.
17121 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17122 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17123 // because that should be faster than the variable permute alternatives.
17124 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17125 return V;
17126
17128 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17129 return V;
17130
17131 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17132 DAG, Subtarget);
17133 }
17134
17135 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17136 Zeroable, Subtarget, DAG))
17137 return PSHUFB;
17138
17139 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17140 if (Subtarget.hasVBMI())
17141 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17142
17143 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17144 // shuffle.
17146 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17147 return Result;
17148
17149 // Try to permute the lanes and then use a per-lane permute.
17151 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17152 return V;
17153
17154 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
17155 // by zeroable elements in the remaining 24 elements. Turn this into two
17156 // vmovqb instructions shuffled together.
17157 if (Subtarget.hasVLX())
17158 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17159 Mask, Zeroable, DAG))
17160 return V;
17161
17162 // Try to match an interleave of two v32i8s and lower them as unpck and
17163 // permutes using ymms.
17164 if (!Subtarget.hasAVX512())
17165 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17166 Mask, DAG))
17167 return V;
17168
17169 // Otherwise fall back on generic lowering.
17170 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17171 Subtarget, DAG);
17172}
17173
17174/// High-level routine to lower various 256-bit x86 vector shuffles.
17175///
17176/// This routine either breaks down the specific type of a 256-bit x86 vector
17177/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17178/// together based on the available instructions.
17180 SDValue V1, SDValue V2, const APInt &Zeroable,
17181 const X86Subtarget &Subtarget,
17182 SelectionDAG &DAG) {
17183 // If we have a single input to the zero element, insert that into V1 if we
17184 // can do so cheaply.
17185 int NumElts = VT.getVectorNumElements();
17186 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17187
17188 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17190 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17191 return Insertion;
17192
17193 // Handle special cases where the lower or upper half is UNDEF.
17194 if (SDValue V =
17195 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17196 return V;
17197
17198 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17199 // can check for those subtargets here and avoid much of the subtarget
17200 // querying in the per-vector-type lowering routines. With AVX1 we have
17201 // essentially *zero* ability to manipulate a 256-bit vector with integer
17202 // types. Since we'll use floating point types there eventually, just
17203 // immediately cast everything to a float and operate entirely in that domain.
17204 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17205 int ElementBits = VT.getScalarSizeInBits();
17206 if (ElementBits < 32) {
17207 // No floating point type available, if we can't use the bit operations
17208 // for masking/blending then decompose into 128-bit vectors.
17209 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17210 Subtarget, DAG))
17211 return V;
17212 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17213 return V;
17214 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17215 }
17216
17217 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17219 V1 = DAG.getBitcast(FpVT, V1);
17220 V2 = DAG.getBitcast(FpVT, V2);
17221 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17222 }
17223
17224 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17225 V1 = DAG.getBitcast(MVT::v16i16, V1);
17226 V2 = DAG.getBitcast(MVT::v16i16, V2);
17227 return DAG.getBitcast(VT,
17228 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17229 }
17230
17231 switch (VT.SimpleTy) {
17232 case MVT::v4f64:
17233 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17234 case MVT::v4i64:
17235 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17236 case MVT::v8f32:
17237 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17238 case MVT::v8i32:
17239 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17240 case MVT::v16i16:
17241 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17242 case MVT::v32i8:
17243 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17244
17245 default:
17246 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17247 }
17248}
17249
17250/// Try to lower a vector shuffle as a 128-bit shuffles.
17252 const APInt &Zeroable, SDValue V1, SDValue V2,
17253 const X86Subtarget &Subtarget,
17254 SelectionDAG &DAG) {
17255 assert(VT.getScalarSizeInBits() == 64 &&
17256 "Unexpected element type size for 128bit shuffle.");
17257
17258 // To handle 256 bit vector requires VLX and most probably
17259 // function lowerV2X128VectorShuffle() is better solution.
17260 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17261
17262 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17263 SmallVector<int, 4> Widened128Mask;
17264 if (!canWidenShuffleElements(Mask, Widened128Mask))
17265 return SDValue();
17266 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17267
17268 // Try to use an insert into a zero vector.
17269 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17270 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17271 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17272 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17273 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17274 DAG.getVectorIdxConstant(0, DL));
17275 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17276 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17277 DAG.getVectorIdxConstant(0, DL));
17278 }
17279
17280 // Check for patterns which can be matched with a single insert of a 256-bit
17281 // subvector.
17282 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17283 if (OnlyUsesV1 ||
17284 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17285 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17286 SDValue SubVec =
17287 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17288 DAG.getVectorIdxConstant(0, DL));
17289 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17290 DAG.getVectorIdxConstant(4, DL));
17291 }
17292
17293 // See if this is an insertion of the lower 128-bits of V2 into V1.
17294 bool IsInsert = true;
17295 int V2Index = -1;
17296 for (int i = 0; i < 4; ++i) {
17297 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17298 if (Widened128Mask[i] < 0)
17299 continue;
17300
17301 // Make sure all V1 subvectors are in place.
17302 if (Widened128Mask[i] < 4) {
17303 if (Widened128Mask[i] != i) {
17304 IsInsert = false;
17305 break;
17306 }
17307 } else {
17308 // Make sure we only have a single V2 index and its the lowest 128-bits.
17309 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17310 IsInsert = false;
17311 break;
17312 }
17313 V2Index = i;
17314 }
17315 }
17316 if (IsInsert && V2Index >= 0) {
17317 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17318 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17319 DAG.getVectorIdxConstant(0, DL));
17320 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17321 }
17322
17323 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17324 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17325 // possible we at least ensure the lanes stay sequential to help later
17326 // combines.
17327 SmallVector<int, 2> Widened256Mask;
17328 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17329 Widened128Mask.clear();
17330 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17331 }
17332
17333 // Try to lower to vshuf64x2/vshuf32x4.
17334 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17335 int PermMask[4] = {-1, -1, -1, -1};
17336 // Ensure elements came from the same Op.
17337 for (int i = 0; i < 4; ++i) {
17338 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17339 if (Widened128Mask[i] < 0)
17340 continue;
17341
17342 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17343 unsigned OpIndex = i / 2;
17344 if (Ops[OpIndex].isUndef())
17345 Ops[OpIndex] = Op;
17346 else if (Ops[OpIndex] != Op)
17347 return SDValue();
17348
17349 PermMask[i] = Widened128Mask[i] % 4;
17350 }
17351
17352 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17353 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17354}
17355
17356/// Handle lowering of 8-lane 64-bit floating point shuffles.
17358 const APInt &Zeroable, SDValue V1, SDValue V2,
17359 const X86Subtarget &Subtarget,
17360 SelectionDAG &DAG) {
17361 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17362 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17363 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17364
17365 if (V2.isUndef()) {
17366 // Use low duplicate instructions for masks that match their pattern.
17367 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17368 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17369
17370 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17371 // Non-half-crossing single input shuffles can be lowered with an
17372 // interleaved permutation.
17373 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17374 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17375 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17376 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17377 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17378 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17379 }
17380
17381 SmallVector<int, 4> RepeatedMask;
17382 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17383 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17384 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17385 }
17386
17387 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17388 V2, Subtarget, DAG))
17389 return Shuf128;
17390
17391 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17392 return Unpck;
17393
17394 // Check if the blend happens to exactly fit that of SHUFPD.
17395 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17396 Zeroable, Subtarget, DAG))
17397 return Op;
17398
17399 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17400 Subtarget, DAG))
17401 return V;
17402
17403 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17404 Zeroable, Subtarget, DAG))
17405 return Blend;
17406
17407 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17408}
17409
17410/// Handle lowering of 16-lane 32-bit floating point shuffles.
17412 const APInt &Zeroable, SDValue V1, SDValue V2,
17413 const X86Subtarget &Subtarget,
17414 SelectionDAG &DAG) {
17415 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17416 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17417 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17418
17419 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17420 // options to efficiently lower the shuffle.
17421 SmallVector<int, 4> RepeatedMask;
17422 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17423 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17424
17425 // Use even/odd duplicate instructions for masks that match their pattern.
17426 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17427 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17428 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17429 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17430
17431 if (V2.isUndef())
17432 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17433 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17434
17435 // Use dedicated unpack instructions for masks that match their pattern.
17436 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17437 return V;
17438
17439 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17440 Zeroable, Subtarget, DAG))
17441 return Blend;
17442
17443 // Otherwise, fall back to a SHUFPS sequence.
17444 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17445 }
17446
17447 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17448 Zeroable, Subtarget, DAG))
17449 return Blend;
17450
17452 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17453 return DAG.getBitcast(MVT::v16f32, ZExt);
17454
17455 // Try to create an in-lane repeating shuffle mask and then shuffle the
17456 // results into the target lanes.
17458 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17459 return V;
17460
17461 // If we have a single input shuffle with different shuffle patterns in the
17462 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17463 if (V2.isUndef() &&
17464 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17465 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17466 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17467 }
17468
17469 // If we have AVX512F support, we can use VEXPAND.
17470 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17471 Zeroable, Subtarget, DAG))
17472 return V;
17473
17474 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17475}
17476
17477/// Handle lowering of 8-lane 64-bit integer shuffles.
17479 const APInt &Zeroable, SDValue V1, SDValue V2,
17480 const X86Subtarget &Subtarget,
17481 SelectionDAG &DAG) {
17482 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17483 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17484 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17485
17486 // Try to use shift instructions if fast.
17487 if (Subtarget.preferLowerShuffleAsShift())
17488 if (SDValue Shift =
17489 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17490 Subtarget, DAG, /*BitwiseOnly*/ true))
17491 return Shift;
17492
17493 if (V2.isUndef()) {
17494 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17495 // can use lower latency instructions that will operate on all four
17496 // 128-bit lanes.
17497 SmallVector<int, 2> Repeated128Mask;
17498 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17499 SmallVector<int, 4> PSHUFDMask;
17500 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17501 return DAG.getBitcast(
17502 MVT::v8i64,
17503 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17504 DAG.getBitcast(MVT::v16i32, V1),
17505 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17506 }
17507
17508 SmallVector<int, 4> Repeated256Mask;
17509 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17510 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17511 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17512 }
17513
17514 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17515 V2, Subtarget, DAG))
17516 return Shuf128;
17517
17518 // Try to use shift instructions.
17519 if (SDValue Shift =
17520 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17521 DAG, /*BitwiseOnly*/ false))
17522 return Shift;
17523
17524 // Try to use VALIGN.
17525 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17526 Zeroable, Subtarget, DAG))
17527 return Rotate;
17528
17529 // Try to use PALIGNR.
17530 if (Subtarget.hasBWI())
17531 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17532 Subtarget, DAG))
17533 return Rotate;
17534
17535 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17536 return Unpck;
17537
17538 // If we have AVX512F support, we can use VEXPAND.
17539 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17540 Subtarget, DAG))
17541 return V;
17542
17543 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17544 Zeroable, Subtarget, DAG))
17545 return Blend;
17546
17547 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17548}
17549
17550/// Handle lowering of 16-lane 32-bit integer shuffles.
17552 const APInt &Zeroable, SDValue V1, SDValue V2,
17553 const X86Subtarget &Subtarget,
17554 SelectionDAG &DAG) {
17555 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17556 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17557 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17558
17559 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17560
17561 // Whenever we can lower this as a zext, that instruction is strictly faster
17562 // than any alternative. It also allows us to fold memory operands into the
17563 // shuffle in many cases.
17565 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17566 return ZExt;
17567
17568 // Try to use shift instructions if fast.
17569 if (Subtarget.preferLowerShuffleAsShift()) {
17570 if (SDValue Shift =
17571 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17572 Subtarget, DAG, /*BitwiseOnly*/ true))
17573 return Shift;
17574 if (NumV2Elements == 0)
17575 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17576 Subtarget, DAG))
17577 return Rotate;
17578 }
17579
17580 // If the shuffle mask is repeated in each 128-bit lane we can use more
17581 // efficient instructions that mirror the shuffles across the four 128-bit
17582 // lanes.
17583 SmallVector<int, 4> RepeatedMask;
17584 bool Is128BitLaneRepeatedShuffle =
17585 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17586 if (Is128BitLaneRepeatedShuffle) {
17587 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17588 if (V2.isUndef())
17589 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17590 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17591
17592 // Use dedicated unpack instructions for masks that match their pattern.
17593 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17594 return V;
17595 }
17596
17597 // Try to use shift instructions.
17598 if (SDValue Shift =
17599 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17600 Subtarget, DAG, /*BitwiseOnly*/ false))
17601 return Shift;
17602
17603 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17604 if (SDValue Rotate =
17605 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17606 return Rotate;
17607
17608 // Try to use VALIGN.
17609 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17610 Zeroable, Subtarget, DAG))
17611 return Rotate;
17612
17613 // Try to use byte rotation instructions.
17614 if (Subtarget.hasBWI())
17615 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17616 Subtarget, DAG))
17617 return Rotate;
17618
17619 // Assume that a single SHUFPS is faster than using a permv shuffle.
17620 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17621 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17622 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17623 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17624 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17625 CastV1, CastV2, DAG);
17626 return DAG.getBitcast(MVT::v16i32, ShufPS);
17627 }
17628
17629 // Try to create an in-lane repeating shuffle mask and then shuffle the
17630 // results into the target lanes.
17632 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17633 return V;
17634
17635 // If we have AVX512F support, we can use VEXPAND.
17636 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17637 Zeroable, Subtarget, DAG))
17638 return V;
17639
17640 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17641 Zeroable, Subtarget, DAG))
17642 return Blend;
17643
17644 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17645}
17646
17647/// Handle lowering of 32-lane 16-bit integer shuffles.
17649 const APInt &Zeroable, SDValue V1, SDValue V2,
17650 const X86Subtarget &Subtarget,
17651 SelectionDAG &DAG) {
17652 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17653 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17654 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17655 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17656
17657 // Whenever we can lower this as a zext, that instruction is strictly faster
17658 // than any alternative. It also allows us to fold memory operands into the
17659 // shuffle in many cases.
17661 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17662 return ZExt;
17663
17664 // Use dedicated unpack instructions for masks that match their pattern.
17665 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17666 return V;
17667
17668 // Use dedicated pack instructions for masks that match their pattern.
17669 if (SDValue V =
17670 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17671 return V;
17672
17673 // Try to use shift instructions.
17674 if (SDValue Shift =
17675 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17676 Subtarget, DAG, /*BitwiseOnly*/ false))
17677 return Shift;
17678
17679 // Try to use byte rotation instructions.
17680 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17681 Subtarget, DAG))
17682 return Rotate;
17683
17684 if (V2.isUndef()) {
17685 // Try to use bit rotation instructions.
17686 if (SDValue Rotate =
17687 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17688 return Rotate;
17689
17690 SmallVector<int, 8> RepeatedMask;
17691 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17692 // As this is a single-input shuffle, the repeated mask should be
17693 // a strictly valid v8i16 mask that we can pass through to the v8i16
17694 // lowering to handle even the v32 case.
17695 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17696 RepeatedMask, Subtarget, DAG);
17697 }
17698 }
17699
17700 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17701 Zeroable, Subtarget, DAG))
17702 return Blend;
17703
17704 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17705 Zeroable, Subtarget, DAG))
17706 return PSHUFB;
17707
17708 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17709 // shuffle.
17710 if (!V2.isUndef())
17712 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17713 return Result;
17714
17715 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17716}
17717
17718/// Handle lowering of 64-lane 8-bit integer shuffles.
17720 const APInt &Zeroable, SDValue V1, SDValue V2,
17721 const X86Subtarget &Subtarget,
17722 SelectionDAG &DAG) {
17723 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17724 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17725 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17726 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17727
17728 // Whenever we can lower this as a zext, that instruction is strictly faster
17729 // than any alternative. It also allows us to fold memory operands into the
17730 // shuffle in many cases.
17732 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17733 return ZExt;
17734
17735 // Use dedicated unpack instructions for masks that match their pattern.
17736 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17737 return V;
17738
17739 // Use dedicated pack instructions for masks that match their pattern.
17740 if (SDValue V =
17741 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17742 return V;
17743
17744 // Try to use shift instructions.
17745 if (SDValue Shift =
17746 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17747 DAG, /*BitwiseOnly*/ false))
17748 return Shift;
17749
17750 // Try to use byte rotation instructions.
17751 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17752 Subtarget, DAG))
17753 return Rotate;
17754
17755 // Try to use bit rotation instructions.
17756 if (V2.isUndef())
17757 if (SDValue Rotate =
17758 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17759 return Rotate;
17760
17761 // Lower as AND if possible.
17762 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17763 Zeroable, Subtarget, DAG))
17764 return Masked;
17765
17766 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17767 Zeroable, Subtarget, DAG))
17768 return PSHUFB;
17769
17770 // Try to create an in-lane repeating shuffle mask and then shuffle the
17771 // results into the target lanes.
17773 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17774 return V;
17775
17777 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17778 return Result;
17779
17780 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17781 Zeroable, Subtarget, DAG))
17782 return Blend;
17783
17784 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17785 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17786 // PALIGNR will be cheaper than the second PSHUFB+OR.
17787 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17788 Mask, Subtarget, DAG))
17789 return V;
17790
17791 // If we can't directly blend but can use PSHUFB, that will be better as it
17792 // can both shuffle and set up the inefficient blend.
17793 bool V1InUse, V2InUse;
17794 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17795 DAG, V1InUse, V2InUse);
17796 }
17797
17798 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17799 // shuffle.
17800 if (!V2.isUndef())
17802 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17803 return Result;
17804
17805 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17806 if (Subtarget.hasVBMI())
17807 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17808
17809 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17810}
17811
17812/// High-level routine to lower various 512-bit x86 vector shuffles.
17813///
17814/// This routine either breaks down the specific type of a 512-bit x86 vector
17815/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17816/// together based on the available instructions.
17818 MVT VT, SDValue V1, SDValue V2,
17819 const APInt &Zeroable,
17820 const X86Subtarget &Subtarget,
17821 SelectionDAG &DAG) {
17822 assert(Subtarget.hasAVX512() &&
17823 "Cannot lower 512-bit vectors w/ basic ISA!");
17824
17825 // If we have a single input to the zero element, insert that into V1 if we
17826 // can do so cheaply.
17827 int NumElts = Mask.size();
17828 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17829
17830 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17832 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17833 return Insertion;
17834
17835 // Handle special cases where the lower or upper half is UNDEF.
17836 if (SDValue V =
17837 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17838 return V;
17839
17840 // Check for being able to broadcast a single element.
17841 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17842 Subtarget, DAG))
17843 return Broadcast;
17844
17845 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17846 // Try using bit ops for masking and blending before falling back to
17847 // splitting.
17848 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17849 Subtarget, DAG))
17850 return V;
17851 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17852 return V;
17853
17854 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17855 }
17856
17857 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17858 if (!Subtarget.hasBWI())
17859 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17860 /*SimpleOnly*/ false);
17861
17862 V1 = DAG.getBitcast(MVT::v32i16, V1);
17863 V2 = DAG.getBitcast(MVT::v32i16, V2);
17864 return DAG.getBitcast(VT,
17865 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17866 }
17867
17868 // Dispatch to each element type for lowering. If we don't have support for
17869 // specific element type shuffles at 512 bits, immediately split them and
17870 // lower them. Each lowering routine of a given type is allowed to assume that
17871 // the requisite ISA extensions for that element type are available.
17872 switch (VT.SimpleTy) {
17873 case MVT::v8f64:
17874 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17875 case MVT::v16f32:
17876 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17877 case MVT::v8i64:
17878 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17879 case MVT::v16i32:
17880 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17881 case MVT::v32i16:
17882 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17883 case MVT::v64i8:
17884 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17885
17886 default:
17887 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17888 }
17889}
17890
17892 MVT VT, SDValue V1, SDValue V2,
17893 const X86Subtarget &Subtarget,
17894 SelectionDAG &DAG) {
17895 // Shuffle should be unary.
17896 if (!V2.isUndef())
17897 return SDValue();
17898
17899 int ShiftAmt = -1;
17900 int NumElts = Mask.size();
17901 for (int i = 0; i != NumElts; ++i) {
17902 int M = Mask[i];
17903 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17904 "Unexpected mask index.");
17905 if (M < 0)
17906 continue;
17907
17908 // The first non-undef element determines our shift amount.
17909 if (ShiftAmt < 0) {
17910 ShiftAmt = M - i;
17911 // Need to be shifting right.
17912 if (ShiftAmt <= 0)
17913 return SDValue();
17914 }
17915 // All non-undef elements must shift by the same amount.
17916 if (ShiftAmt != M - i)
17917 return SDValue();
17918 }
17919 assert(ShiftAmt >= 0 && "All undef?");
17920
17921 // Great we found a shift right.
17922 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17923 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17924 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17925 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17926 DAG.getVectorIdxConstant(0, DL));
17927}
17928
17929// Determine if this shuffle can be implemented with a KSHIFT instruction.
17930// Returns the shift amount if possible or -1 if not. This is a simplified
17931// version of matchShuffleAsShift.
17932static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17933 int MaskOffset, const APInt &Zeroable) {
17934 int Size = Mask.size();
17935
17936 auto CheckZeros = [&](int Shift, bool Left) {
17937 for (int j = 0; j < Shift; ++j)
17938 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17939 return false;
17940
17941 return true;
17942 };
17943
17944 auto MatchShift = [&](int Shift, bool Left) {
17945 unsigned Pos = Left ? Shift : 0;
17946 unsigned Low = Left ? 0 : Shift;
17947 unsigned Len = Size - Shift;
17948 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17949 };
17950
17951 for (int Shift = 1; Shift != Size; ++Shift)
17952 for (bool Left : {true, false})
17953 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17955 return Shift;
17956 }
17957
17958 return -1;
17959}
17960
17961
17962// Lower vXi1 vector shuffles.
17963// There is no a dedicated instruction on AVX-512 that shuffles the masks.
17964// The only way to shuffle bits is to sign-extend the mask vector to SIMD
17965// vector, shuffle and then truncate it back.
17967 MVT VT, SDValue V1, SDValue V2,
17968 const APInt &Zeroable,
17969 const X86Subtarget &Subtarget,
17970 SelectionDAG &DAG) {
17971 assert(Subtarget.hasAVX512() &&
17972 "Cannot lower 512-bit vectors w/o basic ISA!");
17973
17974 int NumElts = Mask.size();
17975 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17976
17977 // Try to recognize shuffles that are just padding a subvector with zeros.
17978 int SubvecElts = 0;
17979 int Src = -1;
17980 for (int i = 0; i != NumElts; ++i) {
17981 if (Mask[i] >= 0) {
17982 // Grab the source from the first valid mask. All subsequent elements need
17983 // to use this same source.
17984 if (Src < 0)
17985 Src = Mask[i] / NumElts;
17986 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17987 break;
17988 }
17989
17990 ++SubvecElts;
17991 }
17992 assert(SubvecElts != NumElts && "Identity shuffle?");
17993
17994 // Clip to a power 2.
17995 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17996
17997 // Make sure the number of zeroable bits in the top at least covers the bits
17998 // not covered by the subvector.
17999 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18000 assert(Src >= 0 && "Expected a source!");
18001 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18002 SDValue Extract =
18003 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18004 DAG.getVectorIdxConstant(0, DL));
18005 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18006 DAG.getConstant(0, DL, VT), Extract,
18007 DAG.getVectorIdxConstant(0, DL));
18008 }
18009
18010 // Try a simple shift right with undef elements. Later we'll try with zeros.
18011 if (SDValue Shift =
18012 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18013 return Shift;
18014
18015 // Try to match KSHIFTs.
18016 unsigned Offset = 0;
18017 for (SDValue V : {V1, V2}) {
18018 unsigned Opcode;
18019 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18020 if (ShiftAmt >= 0) {
18021 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18022 MVT WideVT = Res.getSimpleValueType();
18023 // Widened right shifts need two shifts to ensure we shift in zeroes.
18024 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18025 int WideElts = WideVT.getVectorNumElements();
18026 // Shift left to put the original vector in the MSBs of the new size.
18027 Res =
18028 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18029 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18030 // Increase the shift amount to account for the left shift.
18031 ShiftAmt += WideElts - NumElts;
18032 }
18033
18034 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18035 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18036 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18037 DAG.getVectorIdxConstant(0, DL));
18038 }
18039 Offset += NumElts; // Increment for next iteration.
18040 }
18041
18042 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
18043 // ops instead.
18044 // TODO: What other unary shuffles would benefit from this?
18045 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18046 SDValue Op0 = V1.getOperand(0);
18047 SDValue Op1 = V1.getOperand(1);
18048 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18049 EVT OpVT = Op0.getValueType();
18050 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18051 return DAG.getSetCC(
18052 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18053 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18054 }
18055
18056 MVT ExtVT;
18057 switch (VT.SimpleTy) {
18058 default:
18059 llvm_unreachable("Expected a vector of i1 elements");
18060 case MVT::v2i1:
18061 ExtVT = MVT::v2i64;
18062 break;
18063 case MVT::v4i1:
18064 ExtVT = MVT::v4i32;
18065 break;
18066 case MVT::v8i1:
18067 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18068 // shuffle.
18069 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18070 break;
18071 case MVT::v16i1:
18072 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18073 // 256-bit operation available.
18074 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18075 break;
18076 case MVT::v32i1:
18077 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18078 // 256-bit operation available.
18079 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18080 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18081 break;
18082 case MVT::v64i1:
18083 // Fall back to scalarization. FIXME: We can do better if the shuffle
18084 // can be partitioned cleanly.
18085 if (!Subtarget.useBWIRegs())
18086 return SDValue();
18087 ExtVT = MVT::v64i8;
18088 break;
18089 }
18090
18091 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18092 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18093
18094 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18095 // i1 was sign extended we can use X86ISD::CVT2MASK.
18096 int NumElems = VT.getVectorNumElements();
18097 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18098 (Subtarget.hasDQI() && (NumElems < 32)))
18099 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18100 Shuffle, ISD::SETGT);
18101
18102 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18103}
18104
18105/// Helper function that returns true if the shuffle mask should be
18106/// commuted to improve canonicalization.
18108 int NumElements = Mask.size();
18109
18110 int NumV1Elements = 0, NumV2Elements = 0;
18111 for (int M : Mask)
18112 if (M < 0)
18113 continue;
18114 else if (M < NumElements)
18115 ++NumV1Elements;
18116 else
18117 ++NumV2Elements;
18118
18119 // Commute the shuffle as needed such that more elements come from V1 than
18120 // V2. This allows us to match the shuffle pattern strictly on how many
18121 // elements come from V1 without handling the symmetric cases.
18122 if (NumV2Elements > NumV1Elements)
18123 return true;
18124
18125 assert(NumV1Elements > 0 && "No V1 indices");
18126
18127 if (NumV2Elements == 0)
18128 return false;
18129
18130 // When the number of V1 and V2 elements are the same, try to minimize the
18131 // number of uses of V2 in the low half of the vector. When that is tied,
18132 // ensure that the sum of indices for V1 is equal to or lower than the sum
18133 // indices for V2. When those are equal, try to ensure that the number of odd
18134 // indices for V1 is lower than the number of odd indices for V2.
18135 if (NumV1Elements == NumV2Elements) {
18136 int LowV1Elements = 0, LowV2Elements = 0;
18137 for (int M : Mask.slice(0, NumElements / 2))
18138 if (M >= NumElements)
18139 ++LowV2Elements;
18140 else if (M >= 0)
18141 ++LowV1Elements;
18142 if (LowV2Elements > LowV1Elements)
18143 return true;
18144 if (LowV2Elements == LowV1Elements) {
18145 int SumV1Indices = 0, SumV2Indices = 0;
18146 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18147 if (Mask[i] >= NumElements)
18148 SumV2Indices += i;
18149 else if (Mask[i] >= 0)
18150 SumV1Indices += i;
18151 if (SumV2Indices < SumV1Indices)
18152 return true;
18153 if (SumV2Indices == SumV1Indices) {
18154 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18155 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18156 if (Mask[i] >= NumElements)
18157 NumV2OddIndices += i % 2;
18158 else if (Mask[i] >= 0)
18159 NumV1OddIndices += i % 2;
18160 if (NumV2OddIndices < NumV1OddIndices)
18161 return true;
18162 }
18163 }
18164 }
18165
18166 return false;
18167}
18168
18170 const X86Subtarget &Subtarget) {
18171 if (!Subtarget.hasAVX512())
18172 return false;
18173
18174 if (!V.getValueType().isSimple())
18175 return false;
18176
18177 MVT VT = V.getSimpleValueType().getScalarType();
18178 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18179 return false;
18180
18181 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18182 // are preferable to blendw/blendvb/masked-mov.
18183 if ((VT == MVT::i16 || VT == MVT::i8) &&
18184 V.getSimpleValueType().getSizeInBits() < 512)
18185 return false;
18186
18187 auto HasMaskOperation = [&](SDValue V) {
18188 // TODO: Currently we only check limited opcode. We probably extend
18189 // it to all binary operation by checking TLI.isBinOp().
18190 switch (V->getOpcode()) {
18191 default:
18192 return false;
18193 case ISD::ADD:
18194 case ISD::SUB:
18195 case ISD::AND:
18196 case ISD::XOR:
18197 case ISD::OR:
18198 case ISD::SMAX:
18199 case ISD::SMIN:
18200 case ISD::UMAX:
18201 case ISD::UMIN:
18202 case ISD::ABS:
18203 case ISD::SHL:
18204 case ISD::SRL:
18205 case ISD::SRA:
18206 case ISD::MUL:
18207 break;
18208 }
18209 if (!V->hasOneUse())
18210 return false;
18211
18212 return true;
18213 };
18214
18215 if (HasMaskOperation(V))
18216 return true;
18217
18218 return false;
18219}
18220
18221// Forward declaration.
18224 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18225 const X86Subtarget &Subtarget);
18226
18227 /// Top-level lowering for x86 vector shuffles.
18228///
18229/// This handles decomposition, canonicalization, and lowering of all x86
18230/// vector shuffles. Most of the specific lowering strategies are encapsulated
18231/// above in helper routines. The canonicalization attempts to widen shuffles
18232/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18233/// s.t. only one of the two inputs needs to be tested, etc.
18235 SelectionDAG &DAG) {
18236 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18237 ArrayRef<int> OrigMask = SVOp->getMask();
18238 SDValue V1 = Op.getOperand(0);
18239 SDValue V2 = Op.getOperand(1);
18240 MVT VT = Op.getSimpleValueType();
18241 int NumElements = VT.getVectorNumElements();
18242 SDLoc DL(Op);
18243 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18244
18245 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18246 "Can't lower MMX shuffles");
18247
18248 bool V1IsUndef = V1.isUndef();
18249 bool V2IsUndef = V2.isUndef();
18250 if (V1IsUndef && V2IsUndef)
18251 return DAG.getUNDEF(VT);
18252
18253 // When we create a shuffle node we put the UNDEF node to second operand,
18254 // but in some cases the first operand may be transformed to UNDEF.
18255 // In this case we should just commute the node.
18256 if (V1IsUndef)
18257 return DAG.getCommutedVectorShuffle(*SVOp);
18258
18259 // Check for non-undef masks pointing at an undef vector and make the masks
18260 // undef as well. This makes it easier to match the shuffle based solely on
18261 // the mask.
18262 if (V2IsUndef &&
18263 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18264 SmallVector<int, 8> NewMask(OrigMask);
18265 for (int &M : NewMask)
18266 if (M >= NumElements)
18267 M = -1;
18268 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18269 }
18270
18271 // Check for illegal shuffle mask element index values.
18272 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18273 (void)MaskUpperLimit;
18274 assert(llvm::all_of(OrigMask,
18275 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18276 "Out of bounds shuffle index");
18277
18278 // We actually see shuffles that are entirely re-arrangements of a set of
18279 // zero inputs. This mostly happens while decomposing complex shuffles into
18280 // simple ones. Directly lower these as a buildvector of zeros.
18281 APInt KnownUndef, KnownZero;
18282 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18283
18284 APInt Zeroable = KnownUndef | KnownZero;
18285 if (Zeroable.isAllOnes())
18286 return getZeroVector(VT, Subtarget, DAG, DL);
18287
18288 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18289
18290 // Try to collapse shuffles into using a vector type with fewer elements but
18291 // wider element types. We cap this to not form integers or floating point
18292 // elements wider than 64 bits. It does not seem beneficial to form i128
18293 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18294 SmallVector<int, 16> WidenedMask;
18295 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18296 !canCombineAsMaskOperation(V1, Subtarget) &&
18297 !canCombineAsMaskOperation(V2, Subtarget) &&
18298 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18299 // Shuffle mask widening should not interfere with a broadcast opportunity
18300 // by obfuscating the operands with bitcasts.
18301 // TODO: Avoid lowering directly from this top-level function: make this
18302 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18303 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18304 Subtarget, DAG))
18305 return Broadcast;
18306
18307 MVT NewEltVT = VT.isFloatingPoint()
18310 int NewNumElts = NumElements / 2;
18311 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18312 // Make sure that the new vector type is legal. For example, v2f64 isn't
18313 // legal on SSE1.
18314 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18315 if (V2IsZero) {
18316 // Modify the new Mask to take all zeros from the all-zero vector.
18317 // Choose indices that are blend-friendly.
18318 bool UsedZeroVector = false;
18319 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18320 "V2's non-undef elements are used?!");
18321 for (int i = 0; i != NewNumElts; ++i)
18322 if (WidenedMask[i] == SM_SentinelZero) {
18323 WidenedMask[i] = i + NewNumElts;
18324 UsedZeroVector = true;
18325 }
18326 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18327 // some elements to be undef.
18328 if (UsedZeroVector)
18329 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18330 }
18331 V1 = DAG.getBitcast(NewVT, V1);
18332 V2 = DAG.getBitcast(NewVT, V2);
18333 return DAG.getBitcast(
18334 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18335 }
18336 }
18337
18338 SmallVector<SDValue> Ops = {V1, V2};
18339 SmallVector<int> Mask(OrigMask);
18340
18341 // Canonicalize the shuffle with any horizontal ops inputs.
18342 // NOTE: This may update Ops and Mask.
18344 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18345 return DAG.getBitcast(VT, HOp);
18346
18347 V1 = DAG.getBitcast(VT, Ops[0]);
18348 V2 = DAG.getBitcast(VT, Ops[1]);
18349 assert(NumElements == (int)Mask.size() &&
18350 "canonicalizeShuffleMaskWithHorizOp "
18351 "shouldn't alter the shuffle mask size");
18352
18353 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18354 // These will be materialized uniformly anyway, so make splat matching easier.
18355 // TODO: Allow all int constants?
18356 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18357 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18358 BitVector Undefs;
18359 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18360 if (Undefs.any() &&
18362 isa<ConstantFPSDNode>(Splat))) {
18363 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18364 }
18365 }
18366 }
18367 return V;
18368 };
18369 V1 = CanonicalizeConstant(V1);
18370 V2 = CanonicalizeConstant(V2);
18371
18372 // Commute the shuffle if it will improve canonicalization.
18375 std::swap(V1, V2);
18376 }
18377
18378 // For each vector width, delegate to a specialized lowering routine.
18379 if (VT.is128BitVector())
18380 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18381
18382 if (VT.is256BitVector())
18383 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18384
18385 if (VT.is512BitVector())
18386 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18387
18388 if (Is1BitVector)
18389 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18390
18391 llvm_unreachable("Unimplemented!");
18392}
18393
18394// As legal vpcompress instructions depend on various AVX512 extensions, try to
18395// convert illegal vector sizes to legal ones to avoid expansion.
18397 SelectionDAG &DAG) {
18398 assert(Subtarget.hasAVX512() &&
18399 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18400
18401 SDLoc DL(Op);
18402 SDValue Vec = Op.getOperand(0);
18403 SDValue Mask = Op.getOperand(1);
18404 SDValue Passthru = Op.getOperand(2);
18405
18406 EVT VecVT = Vec.getValueType();
18407 EVT ElementVT = VecVT.getVectorElementType();
18408 unsigned NumElements = VecVT.getVectorNumElements();
18409 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18410 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18411
18412 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18413 // compressed as 512-bit vectors in AVX512F.
18414 if (NumVecBits != 128 && NumVecBits != 256)
18415 return SDValue();
18416
18417 if (NumElementBits == 32 || NumElementBits == 64) {
18418 unsigned NumLargeElements = 512 / NumElementBits;
18419 MVT LargeVecVT =
18420 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18421 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18422
18423 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18424 DAG, DL);
18425 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18426 Subtarget, DAG, DL);
18427 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18428 : widenSubVector(LargeVecVT, Passthru,
18429 /*ZeroNewElements=*/false,
18430 Subtarget, DAG, DL);
18431
18432 SDValue Compressed =
18433 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18434 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18435 DAG.getConstant(0, DL, MVT::i64));
18436 }
18437
18438 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18439 VecVT == MVT::v16i16) {
18440 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
18441 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
18442
18443 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18444 Passthru = Passthru.isUndef()
18445 ? DAG.getUNDEF(LargeVecVT)
18446 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18447
18448 SDValue Compressed =
18449 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18450 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18451 }
18452
18453 return SDValue();
18454}
18455
18456/// Try to lower a VSELECT instruction to a vector shuffle.
18458 const X86Subtarget &Subtarget,
18459 SelectionDAG &DAG) {
18460 SDValue Cond = Op.getOperand(0);
18461 SDValue LHS = Op.getOperand(1);
18462 SDValue RHS = Op.getOperand(2);
18463 MVT VT = Op.getSimpleValueType();
18464
18465 // Only non-legal VSELECTs reach this lowering, convert those into generic
18466 // shuffles and re-use the shuffle lowering path for blends.
18470 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18471 }
18472
18473 return SDValue();
18474}
18475
// Custom lowering for VSELECT. Tries, in order: soft-f16 bitcasting, a
// blend-style shuffle, AVX-512 vXi1 mask selects, condition re-sizing,
// and 256-bit splitting, before deciding whether the node is legal as-is
// (return Op) or must be expanded (return an empty SDValue).
18476SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18477  SDValue Cond = Op.getOperand(0);
18478  SDValue LHS = Op.getOperand(1);
18479  SDValue RHS = Op.getOperand(2);
18480
18481  SDLoc dl(Op);
18482  MVT VT = Op.getSimpleValueType();
18483  if (isSoftF16(VT, Subtarget)) {
  // Soft f16: perform the select on the equivalent integer vector type.
18485    return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18486                                          DAG.getBitcast(NVT, LHS),
18487                                          DAG.getBitcast(NVT, RHS)));
18488  }
18489
18490  // A vselect where all conditions and data are constants can be optimized into
18491  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18495    return SDValue();
18496
18497  // Try to lower this to a blend-style vector shuffle. This can handle all
18498  // constant condition cases.
18499  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18500    return BlendOp;
18501
18502  // If this VSELECT has a vector if i1 as a mask, it will be directly matched
18503  // with patterns on the mask registers on AVX-512.
18504  MVT CondVT = Cond.getSimpleValueType();
18505  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18506  if (CondEltSize == 1)
18507    return Op;
18508
18509  // Variable blends are only legal from SSE4.1 onward.
18510  if (!Subtarget.hasSSE41())
18511    return SDValue();
18512
18513  unsigned EltSize = VT.getScalarSizeInBits();
18514  unsigned NumElts = VT.getVectorNumElements();
18515
18516  // Expand v32i16/v64i8 without BWI.
18517  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18518    return SDValue();
18519
18520  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18521  // into an i1 condition so that we can use the mask-based 512-bit blend
18522  // instructions.
18523  if (VT.getSizeInBits() == 512) {
18524    // Build a mask by testing the condition against zero.
18525    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18526    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18527                                DAG.getConstant(0, dl, CondVT),
18528                                ISD::SETNE);
18529    // Now return a new VSELECT using the mask.
18530    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18531  }
18532
18533  // SEXT/TRUNC cases where the mask doesn't match the destination size.
18534  if (CondEltSize != EltSize) {
18535    // If we don't have a sign splat, rely on the expansion.
18536    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18537      return SDValue();
18538
    // The condition is an all-sign-bits splat, so it survives a resize to
    // the element width of the data operands.
18539    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18540    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18541    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18542    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18543  }
18544
18545  // v16i16/v32i8 selects without AVX2, if the condition and another operand
18546  // are free to split, then better to split before expanding the
18547  // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18548  // TODO: This is very similar to narrowVectorSelect.
18549  // TODO: Add Load splitting to isFreeToSplitVector ?
18550  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18551      !Subtarget.hasXOP()) {
18552    bool FreeCond = isFreeToSplitVector(Cond, DAG);
18553    bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18554                   (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18555    bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18556                   (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18557    if (FreeCond && (FreeLHS || FreeRHS))
18558      return splitVectorOp(Op, DAG, dl);
18559  }
18560
18561  // Only some types will be legal on some subtargets. If we can emit a legal
18562  // VSELECT-matching blend, return Op, and but if we need to expand, return
18563  // a null value.
18564  switch (VT.SimpleTy) {
18565  default:
18566    // Most of the vector types have blends past SSE4.1.
18567    return Op;
18568
18569  case MVT::v32i8:
18570    // The byte blends for AVX vectors were introduced only in AVX2.
18571    if (Subtarget.hasAVX2())
18572      return Op;
18573
18574    return SDValue();
18575
18576  case MVT::v8i16:
18577  case MVT::v16i16:
18578  case MVT::v8f16:
18579  case MVT::v16f16: {
18580    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18581    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18582    Cond = DAG.getBitcast(CastVT, Cond);
18583    LHS = DAG.getBitcast(CastVT, LHS);
18584    RHS = DAG.getBitcast(CastVT, RHS);
18585    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18586    return DAG.getBitcast(VT, Select);
18587  }
18588  }
18589}
18590
// SSE4.1 lowering of EXTRACT_VECTOR_ELT from a 128-bit source: i8 via PEXTRB,
// f32 via an i32 extract + bitcast (EXTRACTPS-style), while i32/i64 extracts
// are already legal. Returns an empty SDValue when no profitable form exists.
18592  MVT VT = Op.getSimpleValueType();
18593  SDValue Vec = Op.getOperand(0);
18594  SDValue Idx = Op.getOperand(1);
18595  assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18596  SDLoc dl(Op);
18597
18599    return SDValue();
18600
18601  if (VT.getSizeInBits() == 8) {
18602    // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18603    // we're going to zero extend the register or fold the store.
18606      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18607                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18608                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));
18609
18610    unsigned IdxVal = Idx->getAsZExtVal();
18611    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18612                                  DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    // PEXTRB produces an i32; truncate back down to the requested i8 type.
18613    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18614  }
18615
18616  if (VT == MVT::f32) {
18617    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18618    // the result back to FR32 register. It's only worth matching if the
18619    // result has a single use which is a store or a bitcast to i32. And in
18620    // the case of a store, it's not worth it if the index is a constant 0,
18621    // because a MOVSSmr can be used instead, which is smaller and faster.
18622    if (!Op.hasOneUse())
18623      return SDValue();
18624    SDNode *User = *Op.getNode()->user_begin();
18625    if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18626        (User->getOpcode() != ISD::BITCAST ||
18627         User->getValueType(0) != MVT::i32))
18628      return SDValue();
18629    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18630                                  DAG.getBitcast(MVT::v4i32, Vec), Idx);
18631    return DAG.getBitcast(MVT::f32, Extract);
18632  }
18633
  // 32/64-bit integer extracts are directly supported (PEXTRD/PEXTRQ).
18634  if (VT == MVT::i32 || VT == MVT::i64)
18635    return Op;
18636
18637  return SDValue();
18638}
18639
18640/// Extract one bit from mask vector, like v16i1 or v8i1.
18641/// AVX-512 feature.
/// For a variable index the mask is sign-extended to a normal vector and an
/// element extract + truncate is emitted; for a constant index a KSHIFTR
/// moves the bit into lane 0 where the extract is legal.
18643                                        const X86Subtarget &Subtarget) {
18644  SDValue Vec = Op.getOperand(0);
18645  SDLoc dl(Vec);
18646  MVT VecVT = Vec.getSimpleValueType();
18647  SDValue Idx = Op.getOperand(1);
18648  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18649  MVT EltVT = Op.getSimpleValueType();
18650
18651  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18652         "Unexpected vector type in ExtractBitFromMaskVector");
18653
18654  // variable index can't be handled in mask registers,
18655  // extend vector to VR512/128
18656  if (!IdxC) {
18657    unsigned NumElts = VecVT.getVectorNumElements();
18658    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
18659    // than extending to 128/256bit.
18660    if (NumElts == 1) {
      // Single-element mask: widen, then read the whole mask as an integer.
18661      Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18663      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18664    }
18665    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18666    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18667    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18668    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18669    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18670  }
18671
18672  unsigned IdxVal = IdxC->getZExtValue();
18673  if (IdxVal == 0) // the operation is legal
18674    return Op;
18675
18676  // Extend to natively supported kshift.
18677  Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18678
18679  // Use kshiftr instruction to move to the lower element.
18680  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18681                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18682
18683  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18684                     DAG.getVectorIdxConstant(0, dl));
18685}
18686
18687// Helper to find all the extracted elements from a vector.
// Returns the set of lanes of N demanded by its users. Conservatively returns
// an all-ones mask as soon as any user is not a constant-index extract or an
// analyzable vector bitcast.
18689  MVT VT = N->getSimpleValueType(0);
18690  unsigned NumElts = VT.getVectorNumElements();
18691  APInt DemandedElts = APInt::getZero(NumElts);
18692  for (SDNode *User : N->users()) {
18693    switch (User->getOpcode()) {
18694    case X86ISD::PEXTRB:
18695    case X86ISD::PEXTRW:
18697      if (!isa<ConstantSDNode>(User->getOperand(1))) {
        // Variable extract index: any lane may be read.
18698        DemandedElts.setAllBits();
18699        return DemandedElts;
18700      }
18701      DemandedElts.setBit(User->getConstantOperandVal(1));
18702      break;
18703    case ISD::BITCAST: {
18704      if (!User->getValueType(0).isSimple() ||
18705          !User->getValueType(0).isVector()) {
18706        DemandedElts.setAllBits();
18707        return DemandedElts;
18708      }
      // Recurse through the bitcast and rescale the demanded lanes back to
      // this node's element count.
18709      APInt DemandedSrcElts = getExtractedDemandedElts(User);
18710      DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18711      break;
18712    }
18713    default:
18714      DemandedElts.setAllBits();
18715      return DemandedElts;
18716    }
18717  }
18718  return DemandedElts;
18719}
18720
// Custom lowering for EXTRACT_VECTOR_ELT: vXi1 masks are delegated to
// ExtractBitFromMaskVector; 256/512-bit sources are peeled down to 128 bits;
// then PEXTRW/SSE4.1 forms, sub-byte extraction via shared DWORD/WORD
// extracts, or a shuffle-to-lane-0 + scalar move are emitted.
18721SDValue
18722X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18723                                           SelectionDAG &DAG) const {
18724  SDLoc dl(Op);
18725  SDValue Vec = Op.getOperand(0);
18726  MVT VecVT = Vec.getSimpleValueType();
18727  SDValue Idx = Op.getOperand(1);
18728  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18729
18730  if (VecVT.getVectorElementType() == MVT::i1)
18731    return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18732
18733  if (!IdxC) {
18734    // Its more profitable to go through memory (1 cycles throughput)
18735    // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18736    // IACA tool was used to get performance estimation
18737    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18738    //
18739    // example : extractelement <16 x i8> %a, i32 %i
18740    //
18741    // Block Throughput: 3.00 Cycles
18742    // Throughput Bottleneck: Port5
18743    //
18744    // | Num Of |   Ports pressure in cycles  |    |
18745    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
18746    // ---------------------------------------------
18747    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
18748    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
18749    // |   2    |    1.0    | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
18750    // Total Num Of Uops: 4
18751    //
18752    //
18753    // Block Throughput: 1.00 Cycles
18754    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18755    //
18756    // |    |  Ports pressure in cycles   |  |
18757    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
18758    // ---------------------------------------------------------
18759    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18760    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
18761    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
18762    // Total Num Of Uops: 4
18763
    // Returning an empty SDValue triggers the default expansion (via stack).
18764    return SDValue();
18765  }
18766
18767  unsigned IdxVal = IdxC->getZExtValue();
18768
18769  // If this is a 256-bit vector result, first extract the 128-bit vector and
18770  // then extract the element from the 128-bit vector.
18771  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18772    // Get the 128-bit vector.
18773    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18774    MVT EltVT = VecVT.getVectorElementType();
18775
18776    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18777    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18778
18779    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18780    // this can be done with a mask.
18781    IdxVal &= ElemsPerChunk - 1;
18782    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18783                       DAG.getVectorIdxConstant(IdxVal, dl));
18784  }
18785
18786  assert(VecVT.is128BitVector() && "Unexpected vector length");
18787
18788  MVT VT = Op.getSimpleValueType();
18789
18790  if (VT == MVT::i16) {
18791    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18792    // we're going to zero extend the register or fold the store (SSE41 only).
18793    if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18794        !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18795      if (Subtarget.hasFP16())
18796        return Op;
18797
18798      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18799                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18800                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));
18801    }
18802
18803    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18804                                  DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18805    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18806  }
18807
18808  if (Subtarget.hasSSE41())
18809    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18810      return Res;
18811
18812  // Only extract a single element from a v16i8 source - determine the common
18813  // DWORD/WORD that all extractions share, and extract the sub-byte.
18814  // TODO: Add QWORD MOVQ extraction?
18815  if (VT == MVT::i8) {
18816    APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18817    assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18818
18819    // Extract either the lowest i32 or any i16, and extract the sub-byte.
18820    int DWordIdx = IdxVal / 4;
    // All demanded bytes within dword 0 -> one shared i32 extract + shifts.
18821    if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18822      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18823                                DAG.getBitcast(MVT::v4i32, Vec),
18824                                DAG.getVectorIdxConstant(DWordIdx, dl));
18825      int ShiftVal = (IdxVal % 4) * 8;
18826      if (ShiftVal != 0)
18827        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18828                          DAG.getConstant(ShiftVal, dl, MVT::i8));
18829      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18830    }
18831
18832    int WordIdx = IdxVal / 2;
    // All demanded bytes within one word -> one shared i16 extract + shifts.
18833    if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18834      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18835                                DAG.getBitcast(MVT::v8i16, Vec),
18836                                DAG.getVectorIdxConstant(WordIdx, dl));
18837      int ShiftVal = (IdxVal % 2) * 8;
18838      if (ShiftVal != 0)
18839        Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18840                          DAG.getConstant(ShiftVal, dl, MVT::i8));
18841      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18842    }
18843  }
18844
18845  if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18846    if (IdxVal == 0)
18847      return Op;
18848
18849    // Shuffle the element to the lowest element, then movss or movsh.
18851    Mask[0] = static_cast<int>(IdxVal);
18852    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18853    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18854                       DAG.getVectorIdxConstant(0, dl));
18855  }
18856
18857  if (VT.getSizeInBits() == 64) {
18858    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18859    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18860    // to match extract_elt for f64.
18861    if (IdxVal == 0)
18862      return Op;
18863
18864    // UNPCKHPD the element to the lowest double word, then movsd.
18865    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18866    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18867    int Mask[2] = { 1, -1 };
18868    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18869    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18870                       DAG.getVectorIdxConstant(0, dl));
18871  }
18872
18873  return SDValue();
18874}
18875
18876/// Insert one bit to mask vector, like v16i1 or v8i1.
18877/// AVX-512 feature.
/// Constant indices become an INSERT_SUBVECTOR of a v1i1; variable indices
/// round-trip through a sign-extended integer vector.
18879                                     const X86Subtarget &Subtarget) {
18880  SDLoc dl(Op);
18881  SDValue Vec = Op.getOperand(0);
18882  SDValue Elt = Op.getOperand(1);
18883  SDValue Idx = Op.getOperand(2);
18884  MVT VecVT = Vec.getSimpleValueType();
18885
18886  if (!isa<ConstantSDNode>(Idx)) {
18887    // Non constant index. Extend source and destination,
18888    // insert element and then truncate the result.
18889    unsigned NumElts = VecVT.getVectorNumElements();
    // Pick the widest element type that keeps the extended vector <= 128 bits.
18890    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18891    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18892    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18893      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18894      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18895    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18896  }
18897
18898  // Copy into a k-register, extract to v1i1 and insert_subvector.
18899  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18900  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18901}
18902
// Custom lowering for INSERT_VECTOR_ELT: handles vXi1 masks, bf16 via i16
// bitcasts, variable indices via a compare+select trick, constant 0/-1
// element insertions as blends, 256/512-bit subvector round-trips, and the
// PINSRB/PINSRW/INSERTPS/BLENDI instruction forms.
18903SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18904                                                  SelectionDAG &DAG) const {
18905  MVT VT = Op.getSimpleValueType();
18906  MVT EltVT = VT.getVectorElementType();
18907  unsigned NumElts = VT.getVectorNumElements();
18908  unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18909
18910  if (EltVT == MVT::i1)
18911    return InsertBitToMaskVector(Op, DAG, Subtarget);
18912
18913  SDLoc dl(Op);
18914  SDValue N0 = Op.getOperand(0);
18915  SDValue N1 = Op.getOperand(1);
18916  SDValue N2 = Op.getOperand(2);
18917  auto *N2C = dyn_cast<ConstantSDNode>(N2);
18918
18919  if (EltVT == MVT::bf16) {
    // bf16 has no insert instruction; perform the insert on the i16 vector.
18921    SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18922                              DAG.getBitcast(IVT, N0),
18923                              DAG.getBitcast(MVT::i16, N1), N2);
18924    return DAG.getBitcast(VT, Res);
18925  }
18926
18927  if (!N2C) {
18928    // Variable insertion indices, usually we're better off spilling to stack,
18929    // but AVX512 can use a variable compare+select by comparing against all
18930    // possible vector indices, and FP insertion has less gpr->simd traffic.
18931    if (!(Subtarget.hasBWI() ||
18932          (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18933          (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18934      return SDValue();
18935
18936    MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18937    MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18938    if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18939      return SDValue();
18940
18941    SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18942    SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18943    SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18944
18945    SmallVector<SDValue, 16> RawIndices;
18946    for (unsigned I = 0; I != NumElts; ++I)
18947      RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18948    SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18949
18950    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18951    return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18953  }
18954
  // Out-of-range constant indices get the default (poison-safe) expansion.
18955  if (N2C->getAPIntValue().uge(NumElts))
18956    return SDValue();
18957  uint64_t IdxVal = N2C->getZExtValue();
18958
18959  bool IsZeroElt = X86::isZeroNode(N1);
18960  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18961
18962  if (IsZeroElt || IsAllOnesElt) {
18963    // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18964    // We don't deal with i8 0 since it appears to be handled elsewhere.
18965    if (IsAllOnesElt &&
18966        ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18967         ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18968      SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18969      SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18970      SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18971      CstVectorElts[IdxVal] = OnesCst;
18972      SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18973      return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18974    }
18975    // See if we can do this more efficiently with a blend shuffle with a
18976    // rematerializable vector.
18977    if (Subtarget.hasSSE41() &&
18978        (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18979      SmallVector<int, 8> BlendMask;
18980      for (unsigned i = 0; i != NumElts; ++i)
18981        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18982      SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18983                                    : getOnesVector(VT, DAG, dl);
18984      return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18985    }
18986  }
18987
18988  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18989  // into that, and then insert the subvector back into the result.
18990  if (VT.is256BitVector() || VT.is512BitVector()) {
18991    // With a 256-bit vector, we can insert into the zero element efficiently
18992    // using a blend if we have AVX or AVX2 and the right data type.
18993    if (VT.is256BitVector() && IdxVal == 0) {
18994      // TODO: It is worthwhile to cast integer to floating point and back
18995      // and incur a domain crossing penalty if that's what we'll end up
18996      // doing anyway after extracting to a 128-bit vector.
18997      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18998          (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18999        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19000        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19001                           DAG.getTargetConstant(1, dl, MVT::i8));
19002      }
19003    }
19004
19005    unsigned NumEltsIn128 = 128 / EltSizeInBits;
19006    assert(isPowerOf2_32(NumEltsIn128) &&
19007           "Vectors will always have power-of-two number of elements.");
19008
19009    // If we are not inserting into the low 128-bit vector chunk,
19010    // then prefer the broadcast+blend sequence.
19011    // FIXME: relax the profitability check iff all N1 uses are insertions.
19012    if (IdxVal >= NumEltsIn128 &&
19013        ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19014         (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19015          X86::mayFoldLoad(N1, Subtarget)))) {
19016      SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19017      SmallVector<int, 8> BlendMask;
19018      for (unsigned i = 0; i != NumElts; ++i)
19019        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19020      return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19021    }
19022
19023    // Get the desired 128-bit vector chunk.
19024    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19025
19026    // Insert the element into the desired chunk.
19027    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19028    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19029
19030    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19031                    DAG.getVectorIdxConstant(IdxIn128, dl));
19032
19033    // Insert the changed part back into the bigger vector
19034    return insert128BitVector(N0, V, IdxVal, DAG, dl);
19035  }
19036  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19037
19038  // This will be just movw/movd/movq/movsh/movss/movsd.
19039  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19040    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19041        EltVT == MVT::f16 || EltVT == MVT::i64) {
19042      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19043      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19044    }
19045
19046    // We can't directly insert an i8 or i16 into a vector, so zero extend
19047    // it to i32 first.
19048    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19049      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19050      MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19051      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19052      N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19053      return DAG.getBitcast(VT, N1);
19054    }
19055  }
19056
19057  // Transform it so it match pinsr{b,w} which expects a GR32 as its second
19058  // argument. SSE41 required for pinsrb.
19059  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19060    unsigned Opc;
19061    if (VT == MVT::v8i16) {
19062      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19064    } else {
19065      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19066      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19068    }
19069
19070    assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19071    N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19072    N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19073    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19074  }
19075
19076  if (Subtarget.hasSSE41()) {
19077    if (EltVT == MVT::f32) {
19078      // Bits [7:6] of the constant are the source select. This will always be
19079      // zero here. The DAG Combiner may combine an extract_elt index into
19080      // these bits. For example (insert (extract, 3), 2) could be matched by
19081      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19082      // Bits [5:4] of the constant are the destination select. This is the
19083      // value of the incoming immediate.
19084      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19085      // combine either bitwise AND or insert of float 0.0 to set these bits.
19086
19087      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19088      if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19089        // If this is an insertion of 32-bits into the low 32-bits of
19090        // a vector, we prefer to generate a blend with immediate rather
19091        // than an insertps. Blends are simpler operations in hardware and so
19092        // will always have equal or better performance than insertps.
19093        // But if optimizing for size and there's a load folding opportunity,
19094        // generate insertps because blendps does not have a 32-bit memory
19095        // operand form.
19096        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19097        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19098                           DAG.getTargetConstant(1, dl, MVT::i8));
19099      }
19100      // Create this as a scalar to vector..
19101      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19102      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19103                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19104    }
19105
19106    // PINSR* works with constant index.
19107    if (EltVT == MVT::i32 || EltVT == MVT::i64)
19108      return Op;
19109  }
19110
19111  return SDValue();
19112}
19113
// Lower SCALAR_TO_VECTOR: zero scalars become a zero vector, wide results are
// built in a 128-bit vector and widened by subvector insertion, and small
// integer elements are routed through v4i32.
19115                                   SelectionDAG &DAG) {
19116  SDLoc dl(Op);
19117  MVT OpVT = Op.getSimpleValueType();
19118
19119  // It's always cheaper to replace a xor+movd with xorps and simplifies further
19120  // combines.
19121  if (X86::isZeroNode(Op.getOperand(0)))
19122    return getZeroVector(OpVT, Subtarget, DAG, dl);
19123
19124  // If this is a 256-bit vector result, first insert into a 128-bit
19125  // vector and then insert into the 256-bit vector.
19126  if (!OpVT.is128BitVector()) {
19127    // Insert into a 128-bit vector.
19128    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19130                                 OpVT.getVectorNumElements() / SizeFactor);
19131
19132    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19133
19134    // Insert the 128-bit vector.
19135    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19136  }
19137  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19138         "Expected an SSE type!");
19139
19140  // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
19141  // tblgen.
19142  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19143    return Op;
19144
  // Remaining small-int cases: widen the scalar to i32, build a v4i32, and
  // bitcast back to the requested type.
19145  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19146  return DAG.getBitcast(
19147      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19148}
19149
19150// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
19151// simple superregister reference or explicit instructions to insert
19152// the upper bits of a vector.
// Only vXi1 (mask) insert_subvectors are custom-lowered; this forwards to
// insert1BitVector.
19154                                     SelectionDAG &DAG) {
19155  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19156
19157  return insert1BitVector(Op, DAG, Subtarget);
19158}
19159
// Custom lowering of vXi1 EXTRACT_SUBVECTOR: widen the mask to a natively
// supported width, KSHIFTR the wanted bits into the low lanes, then take the
// subvector at index 0 (which is legal).
19161                                      SelectionDAG &DAG) {
19162  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19163         "Only vXi1 extract_subvectors need custom lowering");
19164
19165  SDLoc dl(Op);
19166  SDValue Vec = Op.getOperand(0);
19167  uint64_t IdxVal = Op.getConstantOperandVal(1);
19168
19169  if (IdxVal == 0) // the operation is legal
19170    return Op;
19171
19172  // Extend to natively supported kshift.
19173  Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19174
19175  // Shift to the LSB.
19176  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19177                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19178
19179  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19180                     DAG.getVectorIdxConstant(0, dl));
19181}
19182
19183// Returns the appropriate wrapper opcode for a global reference.
19184unsigned X86TargetLowering::getGlobalWrapperKind(
19185 const GlobalValue *GV, const unsigned char OpFlags) const {
19186 // References to absolute symbols are never PC-relative.
19187 if (GV && GV->isAbsoluteSymbolRef())
19188 return X86ISD::Wrapper;
19189
19190 // The following OpFlags under RIP-rel PIC use RIP.
19191 if (Subtarget.isPICStyleRIPRel() &&
19192 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19193 OpFlags == X86II::MO_DLLIMPORT))
19194 return X86ISD::WrapperRIP;
19195
19196 // GOTPCREL references must always use RIP.
19197 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19198 return X86ISD::WrapperRIP;
19199
19200 return X86ISD::Wrapper;
19201}
19202
19203// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19204// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19205// one of the above mentioned nodes. It has to be wrapped because otherwise
19206// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19207// be used to form addressing mode. These wrapped nodes will be selected
19208// into MOV32ri.
19209SDValue
19210X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19211  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19212
19213  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19214  // global base reg.
19215  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19216
19217  auto PtrVT = getPointerTy(DAG.getDataLayout());
19219      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19220  SDLoc DL(CP);
  // Wrap the target constant-pool node so ISel can fold it into an
  // addressing mode (see the block comment above).
19221  Result =
19222      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19223  // With PIC, the address is actually $g + Offset.
19224  if (OpFlag) {
19225    Result =
19226        DAG.getNode(ISD::ADD, DL, PtrVT,
19227                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19228  }
19229
19230  return Result;
19231}
19232
19233SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19234 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19235
19236 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19237 // global base reg.
19238 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19239
19240 EVT PtrVT = Op.getValueType();
19241 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19242 SDLoc DL(JT);
19243 Result =
19244 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19245
19246 // With PIC, the address is actually $g + Offset.
19247 if (OpFlag)
19248 Result =
19249 DAG.getNode(ISD::ADD, DL, PtrVT,
19250 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19251
19252 return Result;
19253}
19254
19255SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19256 SelectionDAG &DAG) const {
19257 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19258}
19259
// Lower a BlockAddress node to its wrapped target counterpart, adding the
// PIC global base register when the reference is PIC-base-relative.
19260SDValue
19261X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19262  // Create the TargetBlockAddressAddress node.
19263  unsigned char OpFlags =
19265  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19266  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19267  SDLoc dl(Op);
19268  EVT PtrVT = Op.getValueType();
19269  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19270  Result =
19271      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19272
19273  // With PIC, the address is actually $g + Offset.
19274  if (isGlobalRelativeToPICBase(OpFlags)) {
19275    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19276                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19277  }
19278
19279  return Result;
19280}
19281
19282/// Creates target global address or external symbol nodes for calls or
19283/// other uses.
///
/// \param ForCall   true when the result feeds a call; enables the
///                  direct-call fast path and function-reference flags.
/// \param IsImpCall when non-null and import call optimization applies,
///                  set to true and the unwrapped address is returned.
///
/// NOTE(review): several lines are elided in this extraction — original
/// 19301 (likely the Module reference "Mod" used below), 19310/19312 (likely
/// the declaration of "Result"), 19322 (the tail of the offset-folding
/// condition, presumably an isInt<32>(Offset) check), and 19358 (the
/// MachinePointerInfo argument of the stub load). Confirm against the full
/// source before editing.
19284SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19285 bool ForCall,
19286 bool *IsImpCall) const {
19287 // Unpack the global address or external symbol.
19288 SDLoc dl(Op);
19289 const GlobalValue *GV = nullptr;
19290 int64_t Offset = 0;
19291 const char *ExternalSym = nullptr;
19292 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19293 GV = G->getGlobal();
19294 Offset = G->getOffset();
19295 } else {
19296 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19297 ExternalSym = ES->getSymbol();
19298 }
19299
19300 // Calculate some flags for address lowering.
 // Call references can classify differently (e.g. PLT) than data references.
19302 unsigned char OpFlags;
19303 if (ForCall)
19304 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19305 else
19306 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19307 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19308 bool NeedsLoad = isGlobalStubReference(OpFlags);
19309
19311 EVT PtrVT = Op.getValueType();
19313
19314 if (GV) {
19315 // Create a target global address if this is a global. If possible, fold the
19316 // offset into the global address reference. Otherwise, ADD it on later.
19317 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19318 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19319 // relocation will compute to a negative value, which is invalid.
19320 int64_t GlobalOffset = 0;
19321 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19323 std::swap(GlobalOffset, Offset);
19324 }
19325 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19326 } else {
19327 // If this is not a global address, this must be an external symbol.
19328 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19329 }
19330
19331 // If this is a direct call, avoid the wrapper if we don't need to do any
19332 // loads or adds. This allows SDAG ISel to match direct calls.
19333 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19334 return Result;
19335
19336 // If Import Call Optimization is enabled and this is an imported function
19337 // then make a note of it and return the global address without wrapping.
19338 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19339 Mod.getModuleFlag("import-call-optimization")) {
19340 assert(ForCall && "Should only enable import call optimization if we are "
19341 "lowering a call");
19342 *IsImpCall = true;
19343 return Result;
19344 }
19345
19346 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19347
19348 // With PIC, the address is actually $g + Offset.
19349 if (HasPICReg) {
19350 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19351 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19352 }
19353
19354 // For globals that require a load from a stub to get the address, emit the
19355 // load.
19356 if (NeedsLoad)
19357 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19359
19360 // If there was a non-zero offset that we didn't fold, create an explicit
19361 // addition for it.
19362 if (Offset != 0)
19363 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19364 DAG.getSignedConstant(Offset, dl, PtrVT));
19365
19366 return Result;
19367}
19368
19369SDValue
19370X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19371 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19372}
19373
// Emit the TLSADDR / TLSBASEADDR / TLSDESC call sequence that materializes a
// TLS address (or, for local-dynamic, the module base) in ReturnReg, and for
// TLSDESC additionally add the thread-pointer load so the final address is
// returned. For local-dynamic TLSDESC the existing _TLS_MODULE_BASE_ call is
// reused when one is already present in the DAG.
// NOTE(review): elided in this extraction — original 19374 (signature head,
// presumably "static SDValue GetTLSADDR(SelectionDAG &DAG,
// GlobalAddressSDNode *GA, ..."), 19379 (likely the MachineFrameInfo
// reference "MFI"), 19412 (the TLSADDR arm of the CallType conditional), and
// 19440/19443 (the pointer-info argument of the thread-pointer load).
// Confirm against the full source.
19375 const EVT PtrVT, unsigned ReturnReg,
19376 unsigned char OperandFlags,
19377 bool LoadGlobalBaseReg = false,
19378 bool LocalDynamic = false) {
19380 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19381 SDLoc dl(GA);
19382 SDValue TGA;
19383 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19384 SDValue Chain = DAG.getEntryNode();
19385 SDValue Ret;
19386 if (LocalDynamic && UseTLSDESC) {
19387 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19388 // Reuse existing GetTLSADDR node if we can find it.
19389 if (TGA->hasOneUse()) {
19390 // TLSDESC uses TGA.
19391 SDNode *TLSDescOp = *TGA->user_begin();
19392 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19393 "Unexpected TLSDESC DAG");
19394 // CALLSEQ_END uses TGA via a chain and glue.
19395 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19396 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19397 "Unexpected TLSDESC DAG");
19398 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19399 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19400 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19401 "Unexpected TLSDESC DAG");
19402 Ret = SDValue(CopyFromRegOp, 0);
19403 }
19404 } else {
19405 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19406 GA->getOffset(), OperandFlags);
19407 }
19408
 // No reusable node found: build the call sequence ourselves.
19409 if (!Ret) {
19410 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19411 : LocalDynamic ? X86ISD::TLSBASEADDR
19413
19414 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19415 if (LoadGlobalBaseReg) {
 // 32-bit PIC TLS calls require the GOT base in EBX.
19416 SDValue InGlue;
19417 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19418 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19419 InGlue);
19420 InGlue = Chain.getValue(1);
19421 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19422 } else {
19423 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19424 }
19425 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19426
19427 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19428 MFI.setHasCalls(true);
19429
19430 SDValue Glue = Chain.getValue(1);
19431 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19432 }
19433
19434 if (!UseTLSDESC)
19435 return Ret;
19436
 // TLSDESC produces an offset; add the thread pointer (%fs on 64-bit,
 // %gs on 32-bit) to form the final address.
19437 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19438 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19439
19441 SDValue Offset =
19442 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19444 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19445}
19446
19447// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
// Result lands in EAX; the GOT base must be loaded into EBX first.
// NOTE(review): the function-name line (original 19449, presumably
// "LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG
// &DAG,") is elided in this extraction — confirm against the full source.
19448static SDValue
19450 const EVT PtrVT) {
19451 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19452 /*LoadGlobalBaseReg=*/true);
19453}
19454
19455// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
// Result lands in RAX; no explicit GOT base register is needed on x86-64.
// NOTE(review): the function-name line (original 19457) is elided in this
// extraction — confirm against the full source.
19456static SDValue
19458 const EVT PtrVT) {
19459 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19460}
19461
19462// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
// Same as the LP64 variant but 32-bit pointers, so the result is in EAX.
// NOTE(review): the function-name line (original 19464) is elided in this
// extraction — confirm against the full source.
19463static SDValue
19465 const EVT PtrVT) {
19466 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19467}
19468
// Lower ISD::GlobalTLSAddress using the "local dynamic" model: one TLSADDR-
// style call obtains the module's TLS block base, then the variable is
// addressed as base + x@dtpoff.
// NOTE(review): elided in this extraction — original 19469 (signature head,
// presumably "static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode
// *GA,") and 19475-19477 (text of the comment block that line 19474 starts).
// Confirm against the full source.
19470 SelectionDAG &DAG, const EVT PtrVT,
19471 bool Is64Bit, bool Is64BitLP64) {
19472 SDLoc dl(GA);
19473
19474 // Get the start address of the TLS block for this module.
19478
19479 SDValue Base;
19480 if (Is64Bit) {
 // 64-bit: result register depends on LP64 vs ILP32; no GOT base needed.
19481 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19482 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19483 /*LoadGlobalBaseReg=*/false,
19484 /*LocalDynamic=*/true);
19485 } else {
 // 32-bit: TLSLDM call with the GOT base loaded into EBX.
19486 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19487 /*LoadGlobalBaseReg=*/true,
19488 /*LocalDynamic=*/true);
19489 }
19490
19491 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19492 // of Base.
19493
19494 // Build x@dtpoff.
19495 unsigned char OperandFlags = X86II::MO_DTPOFF;
19496 unsigned WrapperKind = X86ISD::Wrapper;
19497 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19498 GA->getValueType(0),
19499 GA->getOffset(), OperandFlags);
19500 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19501
19502 // Add x@dtpoff with the base.
19503 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19504}
19505
19506// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// The address is thread-pointer + offset, where the offset is either a
// link-time constant (local exec) or loaded from the GOT (initial exec).
// NOTE(review): elided in this extraction — original 19507 (signature head,
// presumably "static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA,
// SelectionDAG &DAG,"), 19513-19514 (likely the address-space pointer setup
// for the thread-pointer load), and 19553 (the pointer-info argument of the
// GOT load). Confirm against the full source.
19508 const EVT PtrVT, TLSModel::Model model,
19509 bool is64Bit, bool isPIC) {
19510 SDLoc dl(GA);
19511
19512 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19515
19516 SDValue ThreadPointer =
19517 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19519
19520 unsigned char OperandFlags = 0;
19521 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19522 // initialexec.
19523 unsigned WrapperKind = X86ISD::Wrapper;
19524 if (model == TLSModel::LocalExec) {
19525 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19526 } else if (model == TLSModel::InitialExec) {
19527 if (is64Bit) {
19528 OperandFlags = X86II::MO_GOTTPOFF;
19529 WrapperKind = X86ISD::WrapperRIP;
19530 } else {
19531 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19532 }
19533 } else {
19534 llvm_unreachable("Unexpected model");
19535 }
19536
19537 // emit "addl x@ntpoff,%eax" (local exec)
19538 // or "addl x@indntpoff,%eax" (initial exec)
19539 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19540 SDValue TGA =
19541 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19542 GA->getOffset(), OperandFlags);
19543 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19544
19545 if (model == TLSModel::InitialExec) {
 // Initial exec reads the variable's tp-offset from the GOT; in 32-bit
 // PIC mode the GOT slot address is itself relative to the PIC base.
19546 if (isPIC && !is64Bit) {
19547 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19548 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19549 Offset);
19550 }
19551
19552 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19554 }
19555
19556 // The address of the thread local variable is the add of the thread
19557 // pointer with the offset of the variable.
19558 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19559}
19560
// Top-level TLS lowering dispatcher: emulated TLS, then per-OS strategies —
// ELF (per TLS model), Darwin (TLVP call), and Windows (implicit TEB-based
// TLS). Unreachable for any other target.
// NOTE(review): elided in this extraction — original 19576/19583/19586-19587
// (the ELF switch's case labels), 19610 (start of the Darwin
// getTargetGlobalAddress call), 19631 (likely the MachineFrameInfo reference
// "MFI"), 19658/19660 (parts of the Windows thread-pointer PointerType
// expression "Ptr"), 19668 (likely "SDValue ThreadPointer ="), and 19672
// (the condition guarding the aligned-TLS fast path). Confirm against the
// full source.
19561SDValue
19562X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19563
19564 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19565
19566 if (DAG.getTarget().useEmulatedTLS())
19567 return LowerToTLSEmulatedModel(GA, DAG);
19568
19569 const GlobalValue *GV = GA->getGlobal();
19570 EVT PtrVT = Op.getValueType();
19571 bool PositionIndependent = isPositionIndependent();
19572
19573 if (Subtarget.isTargetELF()) {
19574 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19575 switch (model) {
19577 if (Subtarget.is64Bit()) {
19578 if (Subtarget.isTarget64BitLP64())
19579 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19580 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19581 }
19582 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19584 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19585 Subtarget.isTarget64BitLP64());
19588 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19589 PositionIndependent);
19590 }
19591 llvm_unreachable("Unknown TLS model.");
19592 }
19593
19594 if (Subtarget.isTargetDarwin()) {
19595 // Darwin only has one model of TLS. Lower to that.
19596 unsigned char OpFlag = 0;
19597 unsigned WrapperKind = 0;
19598
19599 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19600 // global base reg.
19601 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19602 if (PIC32) {
19603 OpFlag = X86II::MO_TLVP_PIC_BASE;
19604 WrapperKind = X86ISD::Wrapper;
19605 } else {
19606 OpFlag = X86II::MO_TLVP;
19607 WrapperKind = X86ISD::WrapperRIP;
19608 }
19609 SDLoc DL(Op);
19611 GA->getValueType(0),
19612 GA->getOffset(), OpFlag);
19613 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19614
19615 // With PIC32, the address is actually $g + Offset.
19616 if (PIC32)
19617 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19618 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19619 Offset);
19620
19621 // Lowering the machine isd will make sure everything is in the right
19622 // location.
19623 SDValue Chain = DAG.getEntryNode();
19624 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19625 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19626 SDValue Args[] = { Chain, Offset };
19627 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19628 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19629
19630 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19632 MFI.setAdjustsStack(true);
19633
19634 // And our return value (tls address) is in the standard call return value
19635 // location.
19636 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19637 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19638 }
19639
19640 if (Subtarget.isOSWindows()) {
19641 // Just use the implicit TLS architecture
19642 // Need to generate something similar to:
19643 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19644 // ; from TEB
19645 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19646 // mov rcx, qword [rdx+rcx*8]
19647 // mov eax, .tls$:tlsvar
19648 // [rax+rcx] contains the address
19649 // Windows 64bit: gs:0x58
19650 // Windows 32bit: fs:__tls_array
19651
19652 SDLoc dl(GA);
19653 SDValue Chain = DAG.getEntryNode();
19654
19655 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19656 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19657 // use its literal value of 0x2C.
19659 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19661
19662 SDValue TlsArray = Subtarget.is64Bit()
19663 ? DAG.getIntPtrConstant(0x58, dl)
19664 : (Subtarget.isTargetWindowsGNU()
19665 ? DAG.getIntPtrConstant(0x2C, dl)
19666 : DAG.getExternalSymbol("_tls_array", PtrVT));
19667
19669 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19670
19671 SDValue res;
19673 res = ThreadPointer;
19674 } else {
19675 // Load the _tls_index variable
19676 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19677 if (Subtarget.is64Bit())
19678 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19679 MachinePointerInfo(), MVT::i32);
19680 else
19681 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19682
 // Scale the module index by the pointer size to index the TLS array.
19683 const DataLayout &DL = DAG.getDataLayout();
19684 SDValue Scale =
19685 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19686 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19687
19688 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19689 }
19690
19691 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19692
19693 // Get the offset of start of .tls section
19694 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19695 GA->getValueType(0),
19697 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19698
19699 // The address of the thread local variable is the add of the thread
19700 // pointer with the offset of the variable.
19701 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19702 }
19703
19704 llvm_unreachable("TLS not implemented for this target.");
19705}
19706
// Return whether a TLS access for this global can be folded into an x86
// addressing mode (i.e. the %fs-relative form is directly usable). Only the
// 64-bit ELF exec models qualify.
// NOTE(review): elided in this extraction — original 19707 (the function
// signature; from the body it takes a "const GlobalValue &GV" and is a
// member of X86TargetLowering) and 19712-19713 / 19716-19717 (the case
// labels of the switch, presumably LocalExec/InitialExec vs.
// GeneralDynamic/LocalDynamic). Confirm against the full source.
19708 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19709 const TargetMachine &TM = getTargetMachine();
19710 TLSModel::Model Model = TM.getTLSModel(&GV);
19711 switch (Model) {
19714 // We can include the %fs segment register in addressing modes.
19715 return true;
19718 // These models do not result in %fs relative addresses unless
19719 // TLS descriptior are used.
19720 //
19721 // Even in the case of TLS descriptors we currently have no way to model
19722 // the difference between %fs access and the computations needed for the
19723 // offset and returning `true` for TLS-desc currently duplicates both
19724 // which is detrimental :-/
19725 return false;
19726 }
19727 }
19728 return false;
19729}
19730
19731/// Lower SRA_PARTS and friends, which return two i32 values
19732/// and take a 2 x i32 value to shift plus a shift amount.
19733/// TODO: Can this be moved to general expansion code?
// Delegates entirely to the target-independent expandShiftParts helper and
// merges the {Lo, Hi} results.
// NOTE(review): the signature line (original 19734, presumably
// "static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {") is
// elided in this extraction — confirm against the full source.
19735 SDValue Lo, Hi;
19736 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19737 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19738}
19739
19740// Try to use a packed vector operation to handle i64 on 32-bit targets when
19741// AVX512DQ is enabled.
// Applies only to scalar i64 -> f32/f64 on 32-bit targets with DQI: the
// scalar is packed into a vector, converted, and element 0 extracted.
// NOTE(review): the signature line (original 19742, presumably
// "static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,") is
// elided in this extraction — confirm against the full source.
19743 SelectionDAG &DAG,
19744 const X86Subtarget &Subtarget) {
19745 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19746 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19747 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19748 Op.getOpcode() == ISD::UINT_TO_FP) &&
19749 "Unexpected opcode!");
19750 bool IsStrict = Op->isStrictFPOpcode();
19751 unsigned OpNo = IsStrict ? 1 : 0;
19752 SDValue Src = Op.getOperand(OpNo);
19753 MVT SrcVT = Src.getSimpleValueType();
19754 MVT VT = Op.getSimpleValueType();
19755
19756 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19757 (VT != MVT::f32 && VT != MVT::f64))
19758 return SDValue();
19759
19760 // Pack the i64 into a vector, do the operation and extract.
19761
19762 // Using 256-bit to ensure result is 128-bits for f32 case.
19763 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19764 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19765 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19766
19767 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19768 if (IsStrict) {
 // Strict FP: thread the incoming chain through the conversion and return
 // {value, chain}.
19769 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19770 {Op.getOperand(0), InVec});
19771 SDValue Chain = CvtVec.getValue(1);
19772 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19773 DAG.getVectorIdxConstant(0, dl));
19774 return DAG.getMergeValues({Value, Chain}, dl);
19775 }
19776
19777 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19778
19779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19780 DAG.getVectorIdxConstant(0, dl));
19781}
19782
19783// Try to use a packed vector operation to handle i64 on 32-bit targets.
// f16 analogue of LowerI64IntToFP_AVX512DQ: pack the scalar i64 into v2i64,
// convert to v2f16, and extract element 0. Requires FP16 (asserted below).
// NOTE(review): the signature line (original 19784, presumably
// "static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl,
// SelectionDAG &DAG,") is elided in this extraction — confirm against the
// full source.
19785 const X86Subtarget &Subtarget) {
19786 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19787 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19788 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19789 Op.getOpcode() == ISD::UINT_TO_FP) &&
19790 "Unexpected opcode!");
19791 bool IsStrict = Op->isStrictFPOpcode();
19792 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19793 MVT SrcVT = Src.getSimpleValueType();
19794 MVT VT = Op.getSimpleValueType();
19795
19796 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19797 return SDValue();
19798
19799 // Pack the i64 into a vector, do the operation and extract.
19800
19801 assert(Subtarget.hasFP16() && "Expected FP16");
19802
19803 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19804 if (IsStrict) {
 // Strict FP: thread the incoming chain through the conversion and return
 // {value, chain}.
19805 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19806 {Op.getOperand(0), InVec});
19807 SDValue Chain = CvtVec.getValue(1);
19808 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19809 DAG.getVectorIdxConstant(0, dl));
19810 return DAG.getMergeValues({Value, Chain}, dl);
19811 }
19812
19813 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19814
19815 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19816 DAG.getVectorIdxConstant(0, dl));
19817}
19818
19819static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19820 const X86Subtarget &Subtarget) {
19821 switch (Opcode) {
19822 case ISD::SINT_TO_FP:
19823 // TODO: Handle wider types with AVX/AVX512.
19824 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19825 return false;
19826 // CVTDQ2PS or (V)CVTDQ2PD
19827 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19828
19829 case ISD::UINT_TO_FP:
19830 // TODO: Handle wider types and i64 elements.
19831 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19832 return false;
19833 // VCVTUDQ2PS or VCVTUDQ2PD
19834 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19835
19836 default:
19837 return false;
19838 }
19839}
19840
19841/// Given a scalar cast operation that is extracted from a vector, try to
19842/// vectorize the cast op followed by extraction. This will avoid an expensive
19843/// round-trip between XMM and GPR.
// NOTE(review): the signature line (original 19844, presumably
// "static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,") is
// elided in this extraction — confirm against the full source.
19845 SelectionDAG &DAG,
19846 const X86Subtarget &Subtarget) {
19847 // TODO: This could be enhanced to handle smaller integer types by peeking
19848 // through an extend.
19849 SDValue Extract = Cast.getOperand(0);
19850 MVT DestVT = Cast.getSimpleValueType();
19851 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19852 !isa<ConstantSDNode>(Extract.getOperand(1)))
19853 return SDValue();
19854
19855 // See if we have a 128-bit vector cast op for this type of cast.
19856 SDValue VecOp = Extract.getOperand(0);
19857 MVT FromVT = VecOp.getSimpleValueType();
19858 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19859 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19860 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19861 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19862 return SDValue();
19863
19864 // If we are extracting from a non-zero element, first shuffle the source
19865 // vector to allow extracting from element zero.
19866 if (!isNullConstant(Extract.getOperand(1))) {
19867 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19868 Mask[0] = Extract.getConstantOperandVal(1);
19869 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19870 }
19871 // If the source vector is wider than 128-bits, extract the low part. Do not
19872 // create an unnecessarily wide vector cast op.
19873 if (FromVT != Vec128VT)
19874 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19875
19876 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19877 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19878 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19879 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19880 DAG.getVectorIdxConstant(0, DL));
19881}
19882
19883/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19884/// try to vectorize the cast ops. This will avoid an expensive round-trip
19885/// between XMM and GPR.
19886static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19887 SelectionDAG &DAG,
19888 const X86Subtarget &Subtarget) {
19889 // TODO: Allow FP_TO_UINT.
19890 SDValue CastToInt = CastToFP.getOperand(0);
19891 MVT VT = CastToFP.getSimpleValueType();
19892 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19893 return SDValue();
19894
19895 MVT IntVT = CastToInt.getSimpleValueType();
19896 SDValue X = CastToInt.getOperand(0);
19897 MVT SrcVT = X.getSimpleValueType();
19898 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19899 return SDValue();
19900
19901 // See if we have 128-bit vector cast instructions for this type of cast.
19902 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19903 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19904 IntVT != MVT::i32)
19905 return SDValue();
19906
19907 unsigned SrcSize = SrcVT.getSizeInBits();
19908 unsigned IntSize = IntVT.getSizeInBits();
19909 unsigned VTSize = VT.getSizeInBits();
19910 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19911 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19912 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19913
19914 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19915 unsigned ToIntOpcode =
19916 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19917 unsigned ToFPOpcode =
19918 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19919
19920 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19921 //
19922 // We are not defining the high elements (for example, zero them) because
19923 // that could nullify any performance advantage that we hoped to gain from
19924 // this vector op hack. We do not expect any adverse effects (like denorm
19925 // penalties) with cast ops.
19926 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19927 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19928 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19929 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19930 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19931}
19932
// Lower v2i64/v4i64 -> FP conversions. With AVX512DQ (no VLX) the source is
// widened to 512 bits so the legal zmm conversion can be used; otherwise a
// scalarized unsigned path builds v4i64 -> v4f32 from per-element signed
// conversions with a round-to-odd style correction for large values.
// NOTE(review): the signature line (original 19933, presumably
// "static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,") is
// elided in this extraction — confirm against the full source.
19934 SelectionDAG &DAG,
19935 const X86Subtarget &Subtarget) {
19936 bool IsStrict = Op->isStrictFPOpcode();
19937 MVT VT = Op->getSimpleValueType(0);
19938 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19939
19940 if (Subtarget.hasDQI()) {
19941 assert(!Subtarget.hasVLX() && "Unexpected features");
19942
19943 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19944 Src.getSimpleValueType() == MVT::v4i64) &&
19945 "Unsupported custom type");
19946
19947 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19948 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19949 "Unexpected VT!");
19950 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19951
19952 // Need to concat with zero vector for strict fp to avoid spurious
19953 // exceptions.
19954 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19955 : DAG.getUNDEF(MVT::v8i64);
19956 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19957 DAG.getVectorIdxConstant(0, DL));
19958 SDValue Res, Chain;
19959 if (IsStrict) {
19960 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19961 {Op->getOperand(0), Src});
19962 Chain = Res.getValue(1);
19963 } else {
19964 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19965 }
19966
 // Narrow the widened result back to the requested vector type.
19967 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19968 DAG.getVectorIdxConstant(0, DL));
19969
19970 if (IsStrict)
19971 return DAG.getMergeValues({Res, Chain}, DL);
19972 return Res;
19973 }
19974
19975 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19976 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19977 if (VT != MVT::v4f32 || IsSigned)
19978 return SDValue();
19979
 // Unsigned v4i64 -> v4f32 without DQI: for negative (i.e. large unsigned)
 // inputs, halve with the low bit ORed in, convert signed, then double.
19980 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19981 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19982 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19983 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19984 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19985 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19986 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19987 SmallVector<SDValue, 4> SignCvts(4);
19988 SmallVector<SDValue, 4> Chains(4);
19989 for (int i = 0; i != 4; ++i) {
19990 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19991 DAG.getVectorIdxConstant(i, DL));
19992 if (IsStrict) {
19993 SignCvts[i] =
19994 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19995 {Op.getOperand(0), Elt});
19996 Chains[i] = SignCvts[i].getValue(1);
19997 } else {
19998 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19999 }
20000 }
20001 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20002
20003 SDValue Slow, Chain;
20004 if (IsStrict) {
20005 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20006 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20007 {Chain, SignCvt, SignCvt});
20008 Chain = Slow.getValue(1);
20009 } else {
20010 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20011 }
20012
20013 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20014 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20015
20016 if (IsStrict)
20017 return DAG.getMergeValues({Cvt, Chain}, DL);
20018
20019 return Cvt;
20020}
20021
// Promote an int-to-FP conversion whose FP type is soft f16: convert to f32
// (element-wise for vectors) and round the result down to the requested
// type, preserving the strict-FP chain when present.
// NOTE(review): the signature line (original 20022, presumably
// "static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,") is elided
// in this extraction — confirm against the full source.
20023 SelectionDAG &DAG) {
20024 bool IsStrict = Op->isStrictFPOpcode();
20025 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20026 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20027 MVT VT = Op.getSimpleValueType();
20028 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20029
 // Rounding argument for FP_ROUND (0 = cannot assume the value is exact).
20030 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20031 if (IsStrict)
20032 return DAG.getNode(
20033 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20034 {Chain,
20035 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20036 Rnd});
20037 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20038 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20039}
20040
20041static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20042 const X86Subtarget &Subtarget) {
20043 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20044 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20045 return true;
20046 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20047 return true;
20048 }
20049 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20050 return true;
20051 if (Subtarget.useAVX512Regs()) {
20052 if (VT == MVT::v16i32)
20053 return true;
20054 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20055 return true;
20056 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20057 return true;
20058 }
20059 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20060 (VT == MVT::v2i64 || VT == MVT::v4i64))
20061 return true;
20062 return false;
20063}
20064
// Lower ISD::SINT_TO_FP (and its strict variant): try soft-f16 promotion,
// already-legal forms, Win64 i128 libcall, vectorization tricks, vector
// paths, AVX512DQ/FP16 scalar packing, i16 promotion, and finally the x87
// FILD through a stack slot.
// NOTE(review): elided in this extraction — original 20147 (likely the
// MachineFunction reference "MF"), and 20151 (the
// MachinePointerInfo::getFixedStack(...) initializer of MPI). Confirm
// against the full source.
20065SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20066 SelectionDAG &DAG) const {
20067 bool IsStrict = Op->isStrictFPOpcode();
20068 unsigned OpNo = IsStrict ? 1 : 0;
20069 SDValue Src = Op.getOperand(OpNo);
20070 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20071 MVT SrcVT = Src.getSimpleValueType();
20072 MVT VT = Op.getSimpleValueType();
20073 SDLoc dl(Op);
20074
20075 if (isSoftF16(VT, Subtarget))
20076 return promoteXINT_TO_FP(Op, dl, DAG);
20077 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20078 return Op;
20079
20080 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20081 return LowerWin64_INT128_TO_FP(Op, DAG);
20082
20083 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20084 return Extract;
20085
20086 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20087 return R;
20088
20089 if (SrcVT.isVector()) {
20090 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20091 // Note: Since v2f64 is a legal type. We don't need to zero extend the
20092 // source for strict FP.
20093 if (IsStrict)
20094 return DAG.getNode(
20095 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20096 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20097 DAG.getUNDEF(SrcVT))});
20098 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20099 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20100 DAG.getUNDEF(SrcVT)));
20101 }
20102 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20103 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20104
20105 return SDValue();
20106 }
20107
20108 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20109 "Unknown SINT_TO_FP to lower!");
20110
20111 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20112
20113 // These are really Legal; return the operand so the caller accepts it as
20114 // Legal.
20115 if (SrcVT == MVT::i32 && UseSSEReg)
20116 return Op;
20117 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20118 return Op;
20119
20120 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20121 return V;
20122 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20123 return V;
20124
20125 // SSE doesn't have an i16 conversion so we need to promote.
20126 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20127 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20128 if (IsStrict)
20129 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20130 {Chain, Ext});
20131
20132 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20133 }
20134
20135 if (VT == MVT::f128 || !Subtarget.hasX87())
20136 return SDValue();
20137
 // x87 fallback: spill the integer to a stack slot and FILD it back.
20138 SDValue ValueToStore = Src;
20139 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20140 // Bitcasting to f64 here allows us to do a single 64-bit store from
20141 // an SSE register, avoiding the store forwarding penalty that would come
20142 // with two 32-bit stores.
20143 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20144
20145 unsigned Size = SrcVT.getStoreSize();
20146 Align Alignment(Size);
20148 auto PtrVT = getPointerTy(MF.getDataLayout());
20149 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20150 MachinePointerInfo MPI =
20152 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20153 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20154 std::pair<SDValue, SDValue> Tmp =
20155 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20156
20157 if (IsStrict)
20158 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20159
20160 return Tmp.first;
20161}
20162
// Build an x87 FILD (integer load + convert) from the given memory location.
// When the destination is kept in SSE registers, the f80 FILD result is
// spilled back through a stack slot (FST + load) to convert it. Returns
// {result value, updated chain}.
// NOTE(review): elided in this extraction — original 20181 (likely the
// MachineFunction reference "MF"), 20189-20190 (the creation of "StoreMMO"),
// and 20197 (the MachinePointerInfo argument of the final load). Confirm
// against the full source.
20163std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20164 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20165 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20166 // Build the FILD
20167 SDVTList Tys;
20168 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20169 if (useSSE)
20170 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20171 else
20172 Tys = DAG.getVTList(DstVT, MVT::Other);
20173
20174 SDValue FILDOps[] = {Chain, Pointer};
20175 SDValue Result =
20176 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20177 Alignment, MachineMemOperand::MOLoad);
20178 Chain = Result.getValue(1);
20179
20180 if (useSSE) {
 // Round-trip the f80 value through memory to land it in an SSE register.
20182 unsigned SSFISize = DstVT.getStoreSize();
20183 int SSFI =
20184 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20185 auto PtrVT = getPointerTy(MF.getDataLayout());
20186 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20187 Tys = DAG.getVTList(MVT::Other);
20188 SDValue FSTOps[] = {Chain, Result, StackSlot};
20191 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20192
20193 Chain =
20194 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20195 Result = DAG.getLoad(
20196 DstVT, DL, Chain, StackSlot,
20198 Chain = Result.getValue(1);
20199 }
20200
20201 return { Result, Chain };
20202}
20203
20204/// Horizontal vector math instructions may be slower than normal math with
20205/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20206/// implementation, and likely shuffle complexity of the alternate sequence.
20207static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20208 const X86Subtarget &Subtarget) {
20209 bool IsOptimizingSize = DAG.shouldOptForSize();
20210 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20211 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20212}
20213
20214 /// 64-bit unsigned integer to double expansion.
20216 SelectionDAG &DAG,
20217 const X86Subtarget &Subtarget) {
// Expands u64 -> f64 without native support by interleaving the two 32-bit
// halves of the input with magic exponent words, subtracting the matching
// bias doubles, and summing the two lanes (see the asm sketch below).
// NOTE(review): original lines 20215 (the LowerUINT_TO_FP_i64 signature),
// 20235, 20239, 20243, 20258 and 20264 are not visible in this rendering -
// verify against the full file.
20218 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20219 // when converting 0 when rounding toward negative infinity. Caller will
20220 // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode.
20221 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20222 // This algorithm is not obvious. Here it is what we're trying to output:
20223 /*
20224 movq %rax, %xmm0
20225 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20226 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20227 #ifdef __SSE3__
20228 haddpd %xmm0, %xmm0
20229 #else
20230 pshufd $0x4e, %xmm0, %xmm1
20231 addpd %xmm1, %xmm0
20232 #endif
20233 */
20234
20236
// 0x43300000 / 0x45300000 are the high words of the doubles 2^52 and 2^84,
// matching the bias constants built into C1 below.
20237 // Build some magic constants.
20238 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20240 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20241 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20242
20244 CV1.push_back(
20245 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20246 APInt(64, 0x4330000000000000ULL))));
20247 CV1.push_back(
20248 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20249 APInt(64, 0x4530000000000000ULL))));
20250 Constant *C1 = ConstantVector::get(CV1);
20251 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20252
20253 // Load the 64-bit value into an XMM register.
20254 SDValue XR1 =
20255 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20256 SDValue CLod0 = DAG.getLoad(
20257 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
// Interleave the two 32-bit halves of the input with the magic high words;
// bitcast to v2f64 yields { lo-half + 2^52, hi-half + 2^84 }.
20259 SDValue Unpck1 =
20260 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20261
20262 SDValue CLod1 = DAG.getLoad(
20263 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20265 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20266 // TODO: Are there any fast-math-flags to propagate here?
20267 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20268 SDValue Result;
20269
// Sum the two lanes: haddpd when SSE3 horizontal ops are profitable,
// otherwise a lane shuffle followed by a vertical add.
20270 if (Subtarget.hasSSE3() &&
20271 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20272 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20273 } else {
20274 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20275 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20276 }
20277 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20278 DAG.getVectorIdxConstant(0, dl));
20279 return Result;
20280 }
20281
20282 /// 32-bit unsigned integer to float expansion.
20284 SelectionDAG &DAG,
20285 const X86Subtarget &Subtarget) {
// Converts a u32 exactly by ORing it into the mantissa of the double 2^52
// (bit pattern 0x4330000000000000) and subtracting that same bias, then
// rounds/extends the f64 result to the destination type. Handles both the
// strict and non-strict opcodes.
// NOTE(review): original line 20283 (the LowerUINT_TO_FP_i32 signature
// line) is not visible in this rendering - verify against the full file.
20286 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20287 // FP constant to bias correct the final result.
20288 SDValue Bias = DAG.getConstantFP(
20289 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20290
20291 // Load the 32-bit value into an XMM register.
20292 SDValue Load =
20293 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20294
20295 // Zero out the upper parts of the register.
20296 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20297
20298 // Or the load with the bias.
20299 SDValue Or = DAG.getNode(
20300 ISD::OR, dl, MVT::v2i64,
20301 DAG.getBitcast(MVT::v2i64, Load),
20302 DAG.getBitcast(MVT::v2i64,
20303 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20304 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20305 DAG.getBitcast(MVT::v2f64, Or),
20306 DAG.getVectorIdxConstant(0, dl));
20307
20308 if (Op.getNode()->isStrictFPOpcode()) {
20309 // Subtract the bias.
20310 // TODO: Are there any fast-math-flags to propagate here?
20311 SDValue Chain = Op.getOperand(0);
20312 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20313 {Chain, Or, Bias});
20314
20315 if (Op.getValueType() == Sub.getValueType())
20316 return Sub;
20317
20318 // Handle final rounding.
20319 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20320 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20321
20322 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20323 }
20324
20325 // Subtract the bias.
20326 // TODO: Are there any fast-math-flags to propagate here?
20327 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20328
20329 // Handle final rounding.
20330 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20331 }
20332
20334 SelectionDAG &DAG,
20335 const X86Subtarget &Subtarget) {
// Custom-lowers (strict_)uint_to_fp from v2i32, but only to v2f64; every
// other result type is left for generic lowering. AVX512 uses CVTUI2P
// (widening to v4i32 first); otherwise the 2^52 mantissa-bias trick below.
// NOTE(review): original line 20333 (the lowerUINT_TO_FP_v2i32 signature
// line) is not visible in this rendering - verify against the full file.
20336 if (Op.getSimpleValueType() != MVT::v2f64)
20337 return SDValue();
20338
20339 bool IsStrict = Op->isStrictFPOpcode();
20340
20341 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20342 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20343
20344 if (Subtarget.hasAVX512()) {
20345 if (!Subtarget.hasVLX()) {
20346 // Let generic type legalization widen this.
20347 if (!IsStrict)
20348 return SDValue();
20349 // Otherwise pad the integer input with 0s and widen the operation.
// Zero padding (rather than undef) keeps the extra lanes from raising
// spurious FP exceptions in the widened strict operation.
20350 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20351 DAG.getConstant(0, DL, MVT::v2i32));
20352 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20353 {Op.getOperand(0), N0});
20354 SDValue Chain = Res.getValue(1);
20355 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20356 DAG.getVectorIdxConstant(0, DL));
20357 return DAG.getMergeValues({Res, Chain}, DL);
20358 }
20359
20360 // Legalize to v4i32 type.
20361 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20362 DAG.getUNDEF(MVT::v2i32));
20363 if (IsStrict)
20364 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20365 {Op.getOperand(0), N0});
20366 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20367 }
20368
20369 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20370 // This gives us the floating point equivalent of 2^52 + the i32 integer
20371 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20372 // point leaving just our i32 integers in double format.
20373 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20374 SDValue VBias = DAG.getConstantFP(
20375 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20376 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20377 DAG.getBitcast(MVT::v2i64, VBias));
20378 Or = DAG.getBitcast(MVT::v2f64, Or);
20379
20380 if (IsStrict)
20381 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20382 {Op.getOperand(0), Or, VBias});
20383 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20384 }
20385
20387 SelectionDAG &DAG,
20388 const X86Subtarget &Subtarget) {
// Custom-lowers (strict_)uint_to_fp of v4i32/v8i32 inputs. AVX512 widens to
// a 512-bit legal conversion; AVX v4i32->v4f64 uses the 2^52 mantissa-bias
// trick; everything else splits each lane into 16-bit halves and combines
// two float conversions (algorithm documented inline below).
// NOTE(review): original lines 20386 (the lowerUINT_TO_FP_vXi32 signature)
// and 20449-20450 (pointer-info/size operands of the VBROADCAST_LOAD) are
// not visible in this rendering - verify against the full file.
20389 bool IsStrict = Op->isStrictFPOpcode();
20390 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20391 MVT VecIntVT = V.getSimpleValueType();
20392 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20393 "Unsupported custom type");
20394
20395 if (Subtarget.hasAVX512()) {
20396 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20397 assert(!Subtarget.hasVLX() && "Unexpected features");
20398 MVT VT = Op->getSimpleValueType(0);
20399
20400 // v8i32->v8f64 is legal with AVX512 so just return it.
20401 if (VT == MVT::v8f64)
20402 return Op;
20403
20404 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20405 VT == MVT::v8f16) &&
20406 "Unexpected VT!");
20407 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20408 MVT WideIntVT = MVT::v16i32;
20409 if (VT == MVT::v4f64) {
20410 WideVT = MVT::v8f64;
20411 WideIntVT = MVT::v8i32;
20412 }
20413
20414 // Need to concat with zero vector for strict fp to avoid spurious
20415 // exceptions.
20416 SDValue Tmp =
20417 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20418 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20419 DAG.getVectorIdxConstant(0, DL));
20420 SDValue Res, Chain;
20421 if (IsStrict) {
20422 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20423 {Op->getOperand(0), V});
20424 Chain = Res.getValue(1);
20425 } else {
20426 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20427 }
20428
// Pull the original narrow result back out of the widened conversion.
20429 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20430 DAG.getVectorIdxConstant(0, DL));
20431
20432 if (IsStrict)
20433 return DAG.getMergeValues({Res, Chain}, DL);
20434 return Res;
20435 }
20436
// AVX v4i32->v4f64: zero-extend to v4i64, OR each lane into the mantissa of
// a broadcast 2^52 bias, and subtract the bias (exact, as for v2i32 above).
20437 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20438 Op->getSimpleValueType(0) == MVT::v4f64) {
20439 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20440 Constant *Bias = ConstantFP::get(
20441 *DAG.getContext(),
20442 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20443 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20444 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20445 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20446 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20447 SDValue VBias = DAG.getMemIntrinsicNode(
20448 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20451
20452 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20453 DAG.getBitcast(MVT::v4i64, VBias));
20454 Or = DAG.getBitcast(MVT::v4f64, Or);
20455
20456 if (IsStrict)
20457 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20458 {Op.getOperand(0), Or, VBias});
20459 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20460 }
20461
20462 // The algorithm is the following:
20463 // #ifdef __SSE4_1__
20464 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20465 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20466 // (uint4) 0x53000000, 0xaa);
20467 // #else
20468 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20469 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20470 // #endif
20471 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20472 // return (float4) lo + fhi;
20473
20474 bool Is128 = VecIntVT == MVT::v4i32;
20475 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20476 // If we convert to something else than the supported type, e.g., to v4f64,
20477 // abort early.
20478 if (VecFloatVT != Op->getSimpleValueType(0))
20479 return SDValue();
20480
20481 // In the #idef/#else code, we have in common:
20482 // - The vector of constants:
20483 // -- 0x4b000000
20484 // -- 0x53000000
20485 // - A shift:
20486 // -- v >> 16
20487
20488 // Create the splat vector for 0x4b000000.
20489 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20490 // Create the splat vector for 0x53000000.
20491 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20492
20493 // Create the right shift.
20494 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20495 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20496
20497 SDValue Low, High;
20498 if (Subtarget.hasSSE41()) {
20499 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20500 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20501 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20502 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20503 // Low will be bitcasted right away, so do not bother bitcasting back to its
20504 // original type.
20505 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20506 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20507 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20508 // (uint4) 0x53000000, 0xaa);
20509 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20510 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20511 // High will be bitcasted right away, so do not bother bitcasting back to
20512 // its original type.
20513 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20514 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20515 } else {
20516 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20517 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20518 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20519 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20520
20521 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20522 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20523 }
20524
20525 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20526 SDValue VecCstFSub = DAG.getConstantFP(
20527 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20528
20529 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20530 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20531 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20532 // enabled. See PR24512.
20533 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20534 // TODO: Are there any fast-math-flags to propagate here?
20535 // (float4) lo;
20536 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20537 // return (float4) lo + fhi;
20538 if (IsStrict) {
20539 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20540 {Op.getOperand(0), HighBitcast, VecCstFSub});
20541 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20542 {FHigh.getValue(1), LowBitcast, FHigh});
20543 }
20544
20545 SDValue FHigh =
20546 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20547 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20548 }
20549
20551 const X86Subtarget &Subtarget) {
// Dispatches custom vector (strict_)uint_to_fp lowering on the source
// vector type; anything else is a custom-lowering contract violation.
// NOTE(review): original line 20550 (the start of the lowerUINT_TO_FP_vec
// signature) is not visible in this rendering - verify against the full
// file.
20552 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20553 SDValue N0 = Op.getOperand(OpNo);
20554 MVT SrcVT = N0.getSimpleValueType();
20555
20556 switch (SrcVT.SimpleTy) {
20557 default:
20558 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20559 case MVT::v2i32:
20560 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20561 case MVT::v4i32:
20562 case MVT::v8i32:
20563 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20564 case MVT::v2i64:
20565 case MVT::v4i64:
20566 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20567 }
20568 }
20569
20570 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20571 SelectionDAG &DAG) const {
// Top-level custom lowering for (STRICT_)UINT_TO_FP. Tries, in order:
// soft-f16 promotion, already-legal conversions, vector dispatch, Win64
// i128 libcall, extracted-cast vectorization, AVX512 native VCVTUSI2SS/SD,
// i32->i64 signed promotion, DQ/f16 helpers, the SSE2 bias tricks, and
// finally an x87 FILD with a sign-compensating "fudge" constant.
20572 bool IsStrict = Op->isStrictFPOpcode();
20573 unsigned OpNo = IsStrict ? 1 : 0;
20574 SDValue Src = Op.getOperand(OpNo);
20575 SDLoc dl(Op);
20576 auto PtrVT = getPointerTy(DAG.getDataLayout());
20577 MVT SrcVT = Src.getSimpleValueType();
20578 MVT DstVT = Op->getSimpleValueType(0);
20579 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20580
20581 // Bail out when we don't have native conversion instructions.
20582 if (DstVT == MVT::f128)
20583 return SDValue();
20584
20585 if (isSoftF16(DstVT, Subtarget))
20586 return promoteXINT_TO_FP(Op, dl, DAG);
20587 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20588 return Op;
20589
20590 if (DstVT.isVector())
20591 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20592
20593 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20594 return LowerWin64_INT128_TO_FP(Op, DAG);
20595
20596 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20597 return Extract;
20598
20599 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20600 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20601 // Conversions from unsigned i32 to f32/f64 are legal,
20602 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20603 return Op;
20604 }
20605
20606 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20607 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20608 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20609 if (IsStrict)
20610 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20611 {Chain, Src});
20612 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20613 }
20614
20615 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20616 return V;
20617 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20618 return V;
20619
20620 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20621 // infinity. It produces -0.0, so disable under strictfp.
20622 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20623 !IsStrict)
20624 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20625 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20626 // negative infinity. So disable under strictfp. Using FILD instead.
20627 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20628 !IsStrict)
20629 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20630 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20631 (DstVT == MVT::f32 || DstVT == MVT::f64))
20632 return SDValue();
20633
20634 // Make a 64-bit buffer, and use it to build an FILD.
20635 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20636 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20637 Align SlotAlign(8);
20638 MachinePointerInfo MPI =
// NOTE(review): original line 20639 (the MPI initializer, presumably a
// MachinePointerInfo::getFixedStack(...) call) is not visible in this
// rendering - verify against the full file.
20640 if (SrcVT == MVT::i32) {
// A u32 always fits in i64: store {value, 0} into the 64-bit slot and do a
// plain signed FILD of the zero-extended value.
20641 SDValue OffsetSlot =
20642 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20643 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20644 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20645 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20646 std::pair<SDValue, SDValue> Tmp =
20647 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20648 if (IsStrict)
20649 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20650
20651 return Tmp.first;
20652 }
20653
20654 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20655 SDValue ValueToStore = Src;
20656 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20657 // Bitcasting to f64 here allows us to do a single 64-bit store from
20658 // an SSE register, avoiding the store forwarding penalty that would come
20659 // with two 32-bit stores.
20660 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20661 }
20662 SDValue Store =
20663 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20664 // For i64 source, we need to add the appropriate power of 2 if the input
20665 // was negative. We must be careful to do the computation in x87 extended
20666 // precision, not in SSE.
20667 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20668 SDValue Ops[] = {Store, StackSlot};
20669 SDValue Fild =
20670 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20671 SlotAlign, MachineMemOperand::MOLoad);
20672 Chain = Fild.getValue(1);
20673
20674 // Check whether the sign bit is set.
20675 SDValue SignSet = DAG.getSetCC(
20676 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20677 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20678
20679 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
// 0x5F800000 is the f32 encoding of 2^64, the value to add back when the
// signed FILD interpreted the input as negative.
20680 APInt FF(64, 0x5F80000000000000ULL);
20681 SDValue FudgePtr =
20682 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20683 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20684
20685 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20686 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20687 SDValue Four = DAG.getIntPtrConstant(4, dl);
20688 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20689 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20690
20691 // Load the value out, extending it from f32 to f80.
20692 SDValue Fudge = DAG.getExtLoad(
20693 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
// NOTE(review): original line 20694 (the pointer-info argument of this
// ext-load) is not visible in this rendering - verify against the full
// file.
20695 CPAlignment);
20696 Chain = Fudge.getValue(1);
20697 // Extend everything to 80 bits to force it to be done on x87.
20698 // TODO: Are there any fast-math-flags to propagate here?
20699 if (IsStrict) {
20700 unsigned Opc = ISD::STRICT_FADD;
20701 // Windows needs the precision control changed to 80bits around this add.
20702 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
// NOTE(review): original line 20703 (the Opc reassignment for the Windows
// f80-precision add) is not visible in this rendering - verify against the
// full file.
20704
20705 SDValue Add =
20706 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20707 // STRICT_FP_ROUND can't handle equal types.
20708 if (DstVT == MVT::f80)
20709 return Add;
20710 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20711 {Add.getValue(1), Add,
20712 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20713 }
20714 unsigned Opc = ISD::FADD;
20715 // Windows needs the precision control changed to 80bits around this add.
20716 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
// NOTE(review): original line 20717 (the non-strict Opc reassignment) is
// not visible in this rendering - verify against the full file.
20718
20719 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20720 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20721 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20722 }
20723
20724 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20725 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20726 // just return an SDValue().
20727 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20728 // to i16, i32 or i64, and we lower it to a legal sequence and return the
20729 // result.
20730 SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20731 bool IsSigned,
20732 SDValue &Chain) const {
// Lowers the conversion to an x87 FIST through a stack slot. Unsigned i64
// results need a threshold-subtract/XOR fixup (described inline); unsigned
// i32 is handled by a signed i64 FIST whose low half is the answer. The
// output chain is returned through the Chain reference parameter.
20733 bool IsStrict = Op->isStrictFPOpcode();
20734 SDLoc DL(Op);
20735
20736 EVT DstTy = Op.getValueType();
20737 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20738 EVT TheVT = Value.getValueType();
20739 auto PtrVT = getPointerTy(DAG.getDataLayout());
20740
20741 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20742 // f16 must be promoted before using the lowering in this routine.
20743 // fp128 does not use this lowering.
20744 return SDValue();
20745 }
20746
20747 // If using FIST to compute an unsigned i64, we'll need some fixup
20748 // to handle values above the maximum signed i64. A FIST is always
20749 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20750 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20751
20752 // FIXME: This does not generate an invalid exception if the input does not
20753 // fit in i32. PR44019
20754 if (!IsSigned && DstTy != MVT::i64) {
20755 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20756 // The low 32 bits of the fist result will have the correct uint32 result.
20757 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20758 DstTy = MVT::i64;
20759 }
20760
20761 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20762 DstTy.getSimpleVT() >= MVT::i16 &&
20763 "Unknown FP_TO_INT to lower!");
20764
20765 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20766 // stack slot.
// NOTE(review): original line 20767 (presumably the MachineFunction &MF
// definition) is not visible in this rendering - verify against the full
// file.
20768 unsigned MemSize = DstTy.getStoreSize();
20769 int SSFI =
20770 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20771 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20772
20773 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20774
20775 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20776
20777 if (UnsignedFixup) {
20778 //
20779 // Conversion to unsigned i64 is implemented with a select,
20780 // depending on whether the source value fits in the range
20781 // of a signed i64. Let Thresh be the FP equivalent of
20782 // 0x8000000000000000ULL.
20783 //
20784 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20785 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20786 // FistSrc = (Value - FltOfs);
20787 // Fist-to-mem64 FistSrc
20788 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20789 // to XOR'ing the high 32 bits with Adjust.
20790 //
20791 // Being a power of 2, Thresh is exactly representable in all FP formats.
20792 // For X87 we'd like to use the smallest FP type for this constant, but
20793 // for DAG type consistency we have to match the FP operand type.
// 0x5f000000 is the f32 encoding of 2^63, i.e. (float)0x8000000000000000.
20795 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
// NOTE(review): original lines 20796 and 20800 (the APFloat::opStatus
// declaration and the f64 Thresh.convert call) are not visible in this
// rendering - verify against the full file.
20797 bool LosesInfo = false;
20798 if (TheVT == MVT::f64)
20799 // The rounding mode is irrelevant as the conversion should be exact.
20801 &LosesInfo);
20802 else if (TheVT == MVT::f80)
20803 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20804 APFloat::rmNearestTiesToEven, &LosesInfo);
20805
20806 assert(Status == APFloat::opOK && !LosesInfo &&
20807 "FP conversion should have been exact");
20808
20809 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20810
20811 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20812 *DAG.getContext(), TheVT);
20813 SDValue Cmp;
20814 if (IsStrict) {
20815 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20816 /*IsSignaling*/ true);
20817 Chain = Cmp.getValue(1);
20818 } else {
20819 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20820 }
20821
20822 // Our preferred lowering of
20823 //
20824 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20825 //
20826 // is
20827 //
20828 // (Value >= Thresh) << 63
20829 //
20830 // but since we can get here after LegalOperations, DAGCombine might do the
20831 // wrong thing if we create a select. So, directly create the preferred
20832 // version.
20833 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20834 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20835 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20836
20837 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20838 DAG.getConstantFP(0.0, DL, TheVT));
20839
20840 if (IsStrict) {
20841 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20842 { Chain, Value, FltOfs });
20843 Chain = Value.getValue(1);
20844 } else
20845 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20846 }
20847
// NOTE(review): original line 20848 (the MachinePointerInfo MPI setup) is
// not visible in this rendering - verify against the full file.
20849
20850 // FIXME This causes a redundant load/store if the SSE-class value is already
20851 // in memory, such as if it is on the callstack.
20852 if (isScalarFPTypeInSSEReg(TheVT)) {
// Round-trip an SSE value through the stack slot and an x87 FLD, since the
// FP_TO_INT_IN_MEM node below consumes an x87 value.
20853 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20854 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20855 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20856 SDValue Ops[] = { Chain, StackSlot };
20857
20858 unsigned FLDSize = TheVT.getStoreSize();
20859 assert(FLDSize <= MemSize && "Stack slot not big enough");
20861 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20862 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20863 Chain = Value.getValue(1);
20864 }
20865
20866 // Build the FP_TO_INT*_IN_MEM
// NOTE(review): original lines 20860, 20867 and 20870 (the MMO creations
// and the FIST getMemIntrinsicNode call) are not visible in this rendering
// - verify against the full file.
20868 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20869 SDValue Ops[] = { Chain, Value, StackSlot };
20871 DAG.getVTList(MVT::Other),
20872 Ops, DstTy, MMO);
20873
20874 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20875 Chain = Res.getValue(1);
20876
20877 // If we need an unsigned fixup, XOR the result with adjust.
20878 if (UnsignedFixup)
20879 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20880
20881 return Res;
20882 }
20883
20885 const X86Subtarget &Subtarget) {
// Lowers a vector zero/any/sign extension for AVX targets without the
// native 256-bit integer extends: extend the low half in-register and build
// the high half with an unpack, then concatenate. With AVX2 (hasInt256)
// the operation is already legal and is returned unchanged.
// NOTE(review): original lines 20884 (the LowerAVXExtend signature) and
// 20892/20894 (parts of the opcode/element-count asserts) are not visible
// in this rendering - verify against the full file.
20886 MVT VT = Op.getSimpleValueType();
20887 SDValue In = Op.getOperand(0);
20888 MVT InVT = In.getSimpleValueType();
20889 unsigned Opc = Op.getOpcode();
20890
20891 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20893 "Unexpected extension opcode");
20895 "Expected same number of elements");
20896 assert((VT.getVectorElementType() == MVT::i16 ||
20897 VT.getVectorElementType() == MVT::i32 ||
20898 VT.getVectorElementType() == MVT::i64) &&
20899 "Unexpected element type");
20900 assert((InVT.getVectorElementType() == MVT::i8 ||
20901 InVT.getVectorElementType() == MVT::i16 ||
20902 InVT.getVectorElementType() == MVT::i32) &&
20903 "Unexpected element type");
20904
20905 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20906
20907 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20908 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20909 return splitVectorIntUnary(Op, DAG, dl);
20910 }
20911
20912 if (Subtarget.hasInt256())
20913 return Op;
20914
20915 // Optimize vectors in AVX mode:
20916 //
20917 // v8i16 -> v8i32
20918 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20919 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20920 // Concat upper and lower parts.
20921 //
20922 // v4i32 -> v4i64
20923 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20924 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20925 // Concat upper and lower parts.
20926 //
20927 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20928 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20929
20930 // Short-circuit if we can determine that each 128-bit half is the same value.
20931 // Otherwise, this is difficult to match and optimize.
20932 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20933 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20934 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20935
// For the high half, unpack against zero (zext) or undef (sext/aext); the
// unpack interleaves the upper elements into the wider lanes.
20936 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20937 SDValue Undef = DAG.getUNDEF(InVT);
20938 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20939 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20940 OpHi = DAG.getBitcast(HalfVT, OpHi);
20941
20942 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20943 }
20944
20945// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20946static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20947 const SDLoc &dl, SelectionDAG &DAG) {
20948 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20949 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20950 DAG.getVectorIdxConstant(0, dl));
20951 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20952 DAG.getVectorIdxConstant(8, dl));
20953 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20954 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20955 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20956 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20957}
20958
20960 const X86Subtarget &Subtarget,
20961 SelectionDAG &DAG) {
// Lowers zero_extend from a vXi1 mask: for non-i8 elements it is a
// sign_extend followed by a logical shift right; for vXi8 it selects 1/0
// through a (possibly widened) AVX512 vector select and truncates back.
// NOTE(review): original line 20959 (the LowerZERO_EXTEND_Mask signature
// line) is not visible in this rendering - verify against the full file.
20962 MVT VT = Op->getSimpleValueType(0);
20963 SDValue In = Op->getOperand(0);
20964 MVT InVT = In.getSimpleValueType();
20965 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20966 unsigned NumElts = VT.getVectorNumElements();
20967
20968 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
20969 // avoids a constant pool load.
20970 if (VT.getVectorElementType() != MVT::i8) {
20971 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20972 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20973 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20974 }
20975
20976 // Extend VT if BWI is not supported.
20977 MVT ExtVT = VT;
20978 if (!Subtarget.hasBWI()) {
20979 // If v16i32 is to be avoided, we'll need to split and concatenate.
20980 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20981 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20982
20983 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20984 }
20985
20986 // Widen to 512-bits if VLX is not supported.
20987 MVT WideVT = ExtVT;
20988 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20989 NumElts *= 512 / ExtVT.getSizeInBits();
20990 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20991 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
20992 DAG.getVectorIdxConstant(0, DL));
20993 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20994 }
20995
// Materialize the mask as 1/0 per lane via a vector select.
20996 SDValue One = DAG.getConstant(1, DL, WideVT);
20997 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20998
20999 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21000
21001 // Truncate if we had to extend above.
21002 if (VT != ExtVT) {
21003 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21004 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21005 }
21006
21007 // Extract back to 128/256-bit if we widened.
21008 if (WideVT != VT)
21009 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21010 DAG.getVectorIdxConstant(0, DL));
21011
21012 return SelectedVal;
21013 }
21014
                                SelectionDAG &DAG) {
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();
  SDLoc DL(Op);

  // vXi1 mask sources need the dedicated 0/1 materialization path.
  if (SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);

  assert(Subtarget.hasAVX() && "Expected AVX support");
  return LowerAVXExtend(Op, DL, DAG, Subtarget);
}
21027
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");
  assert(DstVT.isVector() && "VT not a vector?");

  // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  unsigned NumElems = SrcVT.getVectorNumElements();
  if (NumElems < 2 || !isPowerOf2_32(NumElems) )
    return SDValue();

  unsigned DstSizeInBits = DstVT.getSizeInBits();
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

  LLVMContext &Ctx = *DAG.getContext();
  // PackedVT: result of one PACK stage - same element count, elements half
  // the source scalar width.
  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  // PACKUSDW requires SSE41; PACKSSDW is available from SSE2.
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (SrcVT.getScalarSizeInBits() > 16 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
    InVT = MVT::i32;
    OutVT = MVT::i16;
  }

  // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
  // On pre-AVX512, pack the src in both halves to help value tracking.
  if (SrcSizeInBits <= 128) {
    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
    In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
    SDValue LHS = DAG.getBitcast(InVT, In);
    SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
    Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
    Res = DAG.getBitcast(PackedVT, Res);
    // Recurse in case more than one halving stage is still required.
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Split lower/upper subvectors.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = splitVector(In, DAG, DL);

  // If Hi is undef, then don't bother packing it and widen the result instead.
  if (Hi.isUndef()) {
    EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
    if (SDValue Res =
            truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
      return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
  }

  unsigned SubSizeInBits = SrcSizeInBits / 2;
  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
    int Scale = 64 / OutVT.getScalarSizeInBits();
    narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
    Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512bit -> 128bit truncate another stage.
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");

  if (PackedVT.is128BitVector()) {
    // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
    // type legalization.
    SDValue Res =
        truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
  Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
21150
/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// MaskX = X & 0xffff (clear high bits to prevent saturation)
/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // Clear the bits above DstVT's element width so PACKUS cannot saturate.
  In = DAG.getZeroExtendInReg(In, DL, DstVT);
  return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
}
21161
/// Truncate using inreg sign extension and X86ISD::PACKSS.
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // Replicate the DstVT-width sign bit across each element so PACKSS cannot
  // saturate.
  EVT SrcVT = In.getValueType();
  In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
                   DAG.getValueType(DstVT));
  return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
}
21171
/// Helper to determine if \p In truncated to \p DstVT has the necessary
/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
/// possibly by converting a SRL node to SRA for sign extension.
/// On success sets \p PackOpcode to PACKSS/PACKUS and returns the (possibly
/// rewritten) source value; otherwise returns SDValue().
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
                                     SDValue In, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     const SDNodeFlags Flags = SDNodeFlags()) {
  // Requires SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();
  EVT DstSVT = DstVT.getVectorElementType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumDstEltBits = DstSVT.getSizeInBits();
  unsigned NumSrcEltBits = SrcSVT.getSizeInBits();

  // Check we have a truncation suited for PACKSS/PACKUS.
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
    return SDValue();

  assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
  // Number of element-halving PACK stages needed for this truncation.
  unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);

  // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
  // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
  // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
  if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
      (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
      (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
    return SDValue();

  // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
  // split this for packing.
  if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
      !isFreeToSplitVector(In, DAG) &&
      (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
    return SDValue();

  // Don't truncate AVX512 targets as multiple PACK nodes stages.
  if (Subtarget.hasAVX512() && NumStages > 1)
    return SDValue();

  unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Truncate with PACKUS if we are truncating a vector with leading zero
  // bits that extend all the way to the packed/truncated value.
  // e.g. Masks, zext_in_reg, etc.
  // Pre-SSE41 we can only use PACKUSWB.
  KnownBits Known = DAG.computeKnownBits(In);
  if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
      (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
    PackOpcode = X86ISD::PACKUS;
    return In;
  }

  // Truncate with PACKSS if we are truncating a vector with sign-bits
  // that extend all the way to the packed/truncated value.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);

  // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
  // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
  // see through BITCASTs later on and combines/simplifications can't then use
  // it.
  if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
      !Subtarget.hasAVX512())
    return SDValue();

  unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
  if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
      MinSignBits < NumSignBits) {
    PackOpcode = X86ISD::PACKSS;
    return In;
  }

  // If we have a srl that only generates signbits that we will discard in
  // the truncation then we can use PACKSS by converting the srl to a sra.
  // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
  if (In.getOpcode() == ISD::SRL && In->hasOneUse())
    if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
      if (*ShAmt == MinSignBits) {
        PackOpcode = X86ISD::PACKSS;
        return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
      }
    }

  return SDValue();
}
21264
/// This function lowers a vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
/// Returns SDValue() if no profitable PACK lowering was found.
    MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
    SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
  MVT SrcVT = In.getSimpleValueType();
  MVT DstSVT = DstVT.getVectorElementType();
  MVT SrcSVT = SrcVT.getVectorElementType();
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
    return SDValue();

  // If the upper half of the source is undef, then attempt to split and
  // only truncate the lower half.
  if (DstVT.getSizeInBits() >= 128) {
    if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
      MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
      if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
                                                         Subtarget, DAG))
        return widenSubVector(Res, false, Subtarget, DAG, DL,
                              DstVT.getSizeInBits());
    }
  }

  unsigned PackOpcode;
  if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
                                          Subtarget, Flags))
    return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);

  return SDValue();
}
21297
/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations.
/// Returns SDValue() when a shuffle-based lowering is preferable.
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT SrcVT = In.getSimpleValueType();
  MVT DstSVT = DstVT.getVectorElementType();
  MVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumElems = DstVT.getVectorNumElements();
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in less instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8) {
    if (SrcSVT == MVT::i16)
      return SDValue();
    if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
      return SDValue();
  }

  // If the upper half of the source is undef, then attempt to split and
  // only truncate the lower half.
  if (DstVT.getSizeInBits() >= 128) {
    if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
      MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
      if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
        return widenSubVector(Res, false, Subtarget, DAG, DL,
                              DstVT.getSizeInBits());
    }
  }

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
    return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);

  if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
    return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);

  // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
  if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
    MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
    return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
  }

  return SDValue();
}
21349
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // legal, will go to VPMOVB2M, VPMOVW2M
      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into sign position.
        // Shift packed bytes not supported natively, bitcast to word
        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
        In = DAG.getNode(ISD::SHL, DL, ExtVT,
                         DAG.getBitcast(ExtVT, In),
                         DAG.getConstant(ShiftInx, DL, ExtVT));
        In = DAG.getBitcast(InVT, In);
      }
      // 0 > In is true exactly when the sign bit (the shifted-up lsb) is set.
      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
                          In, ISD::SETGT);
    }
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
    // We need to change to a wider element type that we have support for.
    // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
    // For 16 element vectors we extend to v16i32 unless we are explicitly
    // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
    // we need to split into two 8 element vectors which we can extend to v8i32,
    // truncate and concat the results. There's an additional complication if
    // the original type is v16i8. In that case we can't split the v16i8
    // directly, so we need to shuffle high elements to low and use
    // sign_extend_vector_inreg.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
      SDValue Lo, Hi;
      if (InVT == MVT::v16i8) {
        Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
        Hi = DAG.getVectorShuffle(
            InVT, DL, In, In,
            {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
        Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
      } else {
        assert(InVT == MVT::v16i16 && "Unexpected VT!");
        Lo = extract128BitVector(In, 0, DAG, DL);
        Hi = extract128BitVector(In, 8, DAG, DL);
      }
      // We're split now, just emit two truncates and a concat. The two
      // truncates will trigger legalization to come back to this function.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
      Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }
    // We either have 8 elements or we're allowed to use 512-bit vectors.
    // If we have VLX, we want to use the narrowest vector that can get the
    // job done so we use vXi32.
    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
    // We need to shift to get the lsb into sign position.
    In = DAG.getNode(ISD::SHL, DL, InVT, In,
                     DAG.getConstant(ShiftInx, DL, InVT));
  }
  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
  if (Subtarget.hasDQI())
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
  return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
}
21428
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
         "Invalid TRUNCATE operation");

  // If we're called by the type legalizer, handle a few cases.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
    if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
        VT.is128BitVector() && Subtarget.hasAVX512()) {
      assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
             "Unexpected subtarget!");
      // The default behavior is to truncate one step, concatenate, and then
      // truncate the remainder. We'd rather produce two 64-bit results and
      // concatenate those.
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(In, DL);

      EVT LoVT, HiVT;
      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

      Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
      Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }

    // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
    if (!Subtarget.hasAVX512() ||
        (InVT.is512BitVector() && VT.is256BitVector()))
              VT, In, DL, Subtarget, DAG, Op->getFlags()))
        return SignPack;

    // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
    if (!Subtarget.hasAVX512())
      return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);

    // Otherwise let default legalization handle it.
    return SDValue();
  }

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DL, DAG, Subtarget);

  // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
  // concat from subvectors to use VPTRUNC etc.
  if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
            VT, In, DL, Subtarget, DAG, Op->getFlags()))
      return SignPack;

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
      assert(VT == MVT::v32i8 && "Unexpected VT!");
      return splitVectorIntUnary(Op, DAG, DL);
    }

    // word to byte only under BWI. Otherwise we have to promoted to v16i32
    // and then truncate that. But we should only do that if we haven't been
    // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
    // handled by isel patterns.
    if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
        Subtarget.canExtendTo512DQ())
      return Op;
  }

  // Handle truncation of V256 to V128 using shuffles.
  assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getVectorIdxConstant(0, DL));
    }

    // AVX1: extract the two 128-bit halves and shuffle the even i32s together.
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getVectorIdxConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getVectorIdxConstant(2, DL));
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
                                DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getBitcast(MVT::v32i8, In);
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getVectorIdxConstant(0, DL));
      return DAG.getBitcast(MVT::v8i16, In);
    }

    return Subtarget.hasSSE41()
               ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
               : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
  }

  if (VT == MVT::v16i8 && InVT == MVT::v16i16)
    return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);

  llvm_unreachable("All 256->128 cases should have been handled above!");
}
21550
// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
// behaves on out of range inputs to generate optimized conversions.
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT SrcVT = Src.getSimpleValueType();
  unsigned DstBits = VT.getScalarSizeInBits();
  assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");

  // Calculate the converted result for values in the range 0 to
  // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
  SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
  SDValue Big =
      DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
                  DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
                              DAG.getConstantFP(2147483648.0f, dl, SrcVT)));

  // The "CVTTP2SI" instruction conveniently sets the sign bit if
  // and only if the value was out of range. So we can use that
  // as our indicator that we rather use "Big" instead of "Small".
  //
  // Use "Small" if "IsOverflown" has all bits cleared
  // and "0x80000000 | Big" if all bits in "IsOverflown" are set.

  // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
  // use the slightly slower blendv select instead.
  if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
    SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
    return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
  }

  // Broadcast Small's sign bit across each lane to build the select mask.
  SDValue IsOverflown =
      DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
                  DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
  return DAG.getNode(ISD::OR, dl, VT, Small,
                     DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
}
21588
21589SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21590 bool IsStrict = Op->isStrictFPOpcode();
21591 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21592 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21593 bool HasVLX = Subtarget.hasVLX();
21594 MVT VT = Op->getSimpleValueType(0);
21595 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21596 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21597 MVT SrcVT = Src.getSimpleValueType();
21598 SDLoc dl(Op);
21599
21600 SDValue Res;
21601 if (isSoftF16(SrcVT, Subtarget)) {
21602 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21603 if (IsStrict)
21604 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21605 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21606 {NVT, MVT::Other}, {Chain, Src})});
21607 return DAG.getNode(Op.getOpcode(), dl, VT,
21608 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21609 } else if (isTypeLegal(SrcVT) &&
21610 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21611 return Op;
21612 }
21613
21614 if (VT.isVector()) {
21615 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21616 MVT ResVT = MVT::v4i32;
21617 MVT TruncVT = MVT::v4i1;
21618 unsigned Opc;
21619 if (IsStrict)
21621 else
21622 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21623
21624 if (!IsSigned && !HasVLX) {
21625 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21626 // Widen to 512-bits.
21627 ResVT = MVT::v8i32;
21628 TruncVT = MVT::v8i1;
21629 Opc = Op.getOpcode();
21630 // Need to concat with zero vector for strict fp to avoid spurious
21631 // exceptions.
21632 // TODO: Should we just do this for non-strict as well?
21633 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21634 : DAG.getUNDEF(MVT::v8f64);
21635 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21636 DAG.getVectorIdxConstant(0, dl));
21637 }
21638 if (IsStrict) {
21639 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21640 Chain = Res.getValue(1);
21641 } else {
21642 Res = DAG.getNode(Opc, dl, ResVT, Src);
21643 }
21644
21645 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21646 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21647 DAG.getVectorIdxConstant(0, dl));
21648 if (IsStrict)
21649 return DAG.getMergeValues({Res, Chain}, dl);
21650 return Res;
21651 }
21652
21653 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21654 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21655 VT == MVT::v32i16)
21656 return Op;
21657
21658 MVT ResVT = VT;
21659 MVT EleVT = VT.getVectorElementType();
21660 if (EleVT != MVT::i64)
21661 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21662
21663 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21664 SDValue Tmp =
21665 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21666 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21667 Ops[0] = Src;
21668 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21669 }
21670
21671 if (!HasVLX) {
21672 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21673 // Widen to 512-bits.
21674 unsigned IntSize = EleVT.getSizeInBits();
21675 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21676 ResVT = MVT::getVectorVT(EleVT, Num);
21677 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21678 Subtarget, DAG, dl);
21679 }
21680
21681 if (IsStrict) {
21682 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21684 dl, {ResVT, MVT::Other}, {Chain, Src});
21685 Chain = Res.getValue(1);
21686 } else {
21687 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21688 ResVT, Src);
21689 }
21690
21691 // TODO: Need to add exception check code for strict FP.
21692 if (EleVT.getSizeInBits() < 16) {
21693 if (HasVLX)
21694 ResVT = MVT::getVectorVT(EleVT, 8);
21695 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21696 }
21697
21698 if (ResVT != VT)
21699 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21700 DAG.getVectorIdxConstant(0, dl));
21701
21702 if (IsStrict)
21703 return DAG.getMergeValues({Res, Chain}, dl);
21704 return Res;
21705 }
21706
21707 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21708 if (VT.getVectorElementType() == MVT::i16) {
21709 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21710 SrcVT.getVectorElementType() == MVT::f64) &&
21711 "Expected f32/f64 vector!");
21712 MVT NVT = VT.changeVectorElementType(MVT::i32);
21713 if (IsStrict) {
21714 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21716 dl, {NVT, MVT::Other}, {Chain, Src});
21717 Chain = Res.getValue(1);
21718 } else {
21719 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21720 NVT, Src);
21721 }
21722
21723 // TODO: Need to add exception check code for strict FP.
21724 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21725
21726 if (IsStrict)
21727 return DAG.getMergeValues({Res, Chain}, dl);
21728 return Res;
21729 }
21730
21731 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21732 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21733 assert(!IsSigned && "Expected unsigned conversion!");
21734 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21735 return Op;
21736 }
21737
21738 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21739 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21740 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21741 Subtarget.useAVX512Regs()) {
21742 assert(!IsSigned && "Expected unsigned conversion!");
21743 assert(!Subtarget.hasVLX() && "Unexpected features!");
21744 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21745 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21746 // Need to concat with zero vector for strict fp to avoid spurious
21747 // exceptions.
21748 // TODO: Should we just do this for non-strict as well?
21749 SDValue Tmp =
21750 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21751 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21752 DAG.getVectorIdxConstant(0, dl));
21753
21754 if (IsStrict) {
21755 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21756 {Chain, Src});
21757 Chain = Res.getValue(1);
21758 } else {
21759 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21760 }
21761
21762 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21763 DAG.getVectorIdxConstant(0, dl));
21764
21765 if (IsStrict)
21766 return DAG.getMergeValues({Res, Chain}, dl);
21767 return Res;
21768 }
21769
21770 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21771 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21772 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21773 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21774 assert(!Subtarget.hasVLX() && "Unexpected features!");
21775 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21776 // Need to concat with zero vector for strict fp to avoid spurious
21777 // exceptions.
21778 // TODO: Should we just do this for non-strict as well?
21779 SDValue Tmp =
21780 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21781 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21782 DAG.getVectorIdxConstant(0, dl));
21783
21784 if (IsStrict) {
21785 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21786 {Chain, Src});
21787 Chain = Res.getValue(1);
21788 } else {
21789 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21790 }
21791
21792 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21793 DAG.getVectorIdxConstant(0, dl));
21794
21795 if (IsStrict)
21796 return DAG.getMergeValues({Res, Chain}, dl);
21797 return Res;
21798 }
21799
21800 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21801 if (!Subtarget.hasVLX()) {
21802 // Non-strict nodes without VLX can we widened to v4f32->v4i64 by type
21803 // legalizer and then widened again by vector op legalization.
21804 if (!IsStrict)
21805 return SDValue();
21806
21807 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21808 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21809 {Src, Zero, Zero, Zero});
21810 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21811 {Chain, Tmp});
21812 SDValue Chain = Tmp.getValue(1);
21813 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21814 DAG.getVectorIdxConstant(0, dl));
21815 return DAG.getMergeValues({Tmp, Chain}, dl);
21816 }
21817
21818 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21819 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21820 DAG.getUNDEF(MVT::v2f32));
21821 if (IsStrict) {
21822 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21824 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21825 }
21826 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21827 return DAG.getNode(Opc, dl, VT, Tmp);
21828 }
21829
21830 // Generate optimized instructions for pre AVX512 unsigned conversions from
21831 // vXf32 to vXi32.
21832 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21833 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21834 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21835 assert(!IsSigned && "Expected unsigned conversion!");
21836 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21837 }
21838
21839 return SDValue();
21840 }
21841
21842 assert(!VT.isVector());
21843
21844 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21845
21846 if (!IsSigned && UseSSEReg) {
21847 // Conversions from f32/f64 with AVX512 should be legal.
21848 if (Subtarget.hasAVX512())
21849 return Op;
21850
21851 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21852 // behaves on out of range inputs to generate optimized conversions.
21853 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21854 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21855 unsigned DstBits = VT.getScalarSizeInBits();
21856 APInt UIntLimit = APInt::getSignMask(DstBits);
21857 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21858 DAG.getConstant(UIntLimit, dl, VT));
21859 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21860
21861 // Calculate the converted result for values in the range:
21862 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21863 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21864 SDValue Small =
21865 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21866 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21867 SDValue Big = DAG.getNode(
21868 X86ISD::CVTTS2SI, dl, VT,
21869 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21870 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21871
21872 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21873 // and only if the value was out of range. So we can use that
21874 // as our indicator that we rather use "Big" instead of "Small".
21875 //
21876 // Use "Small" if "IsOverflown" has all bits cleared
21877 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21878 SDValue IsOverflown = DAG.getNode(
21879 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21880 return DAG.getNode(ISD::OR, dl, VT, Small,
21881 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21882 }
21883
21884 // Use default expansion for i64.
21885 if (VT == MVT::i64)
21886 return SDValue();
21887
21888 assert(VT == MVT::i32 && "Unexpected VT!");
21889
21890 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21891 // FIXME: This does not generate an invalid exception if the input does not
21892 // fit in i32. PR44019
21893 if (Subtarget.is64Bit()) {
21894 if (IsStrict) {
21895 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21896 {Chain, Src});
21897 Chain = Res.getValue(1);
21898 } else
21899 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21900
21901 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21902 if (IsStrict)
21903 return DAG.getMergeValues({Res, Chain}, dl);
21904 return Res;
21905 }
21906
21907 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21908 // use fisttp which will be handled later.
21909 if (!Subtarget.hasSSE3())
21910 return SDValue();
21911 }
21912
21913 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21914 // FIXME: This does not generate an invalid exception if the input does not
21915 // fit in i16. PR44019
21916 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21917 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21918 if (IsStrict) {
21919 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21920 {Chain, Src});
21921 Chain = Res.getValue(1);
21922 } else
21923 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21924
21925 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21926 if (IsStrict)
21927 return DAG.getMergeValues({Res, Chain}, dl);
21928 return Res;
21929 }
21930
21931 // If this is a FP_TO_SINT using SSEReg we're done.
21932 if (UseSSEReg && IsSigned)
21933 return Op;
21934
21935 // fp128 needs to use a libcall.
21936 if (SrcVT == MVT::f128) {
21937 RTLIB::Libcall LC;
21938 if (IsSigned)
21939 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21940 else
21941 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21942
21943 MakeLibCallOptions CallOptions;
21944 std::pair<SDValue, SDValue> Tmp =
21945 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21946
21947 if (IsStrict)
21948 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21949
21950 return Tmp.first;
21951 }
21952
21953 // Fall back to X87.
21954 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21955 if (IsStrict)
21956 return DAG.getMergeValues({V, Chain}, dl);
21957 return V;
21958 }
21959
21960 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21961}
21962
21963SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21964 SelectionDAG &DAG) const {
21965 SDValue Src = Op.getOperand(0);
21966 EVT DstVT = Op.getSimpleValueType();
21967 MVT SrcVT = Src.getSimpleValueType();
21968
21969 if (SrcVT.isVector())
21970 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21971
21972 if (SrcVT == MVT::f16)
21973 return SDValue();
21974
21975 // If the source is in an SSE register, the node is Legal.
21976 if (isScalarFPTypeInSSEReg(SrcVT))
21977 return Op;
21978
21979 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21980}
21981
/// Lower LRINT/LLRINT of an f32/f64/f80 source by going through memory:
/// an SSE-resident source is first stored and reloaded into x87 via FLD,
/// then an X86ISD::FIST stores the rounded integer to a stack slot, and the
/// final result is loaded back from that slot.
SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
                                              SelectionDAG &DAG) const {
  EVT DstVT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  SDLoc DL(N);
  SDValue Chain = DAG.getEntryNode();

  bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);

  // If we're converting from SSE, the stack slot needs to hold both types.
  // Otherwise it only needs to hold the DstVT.
  EVT OtherVT = UseSSE ? SrcVT : DstVT;
  SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);

  if (UseSSE) {
    assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
    // Store the SSE value, then reload it into the x87 stack as f80 so FIST
    // can consume it.
    Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = { Chain, StackPtr };

    Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
                                  /*Align*/ std::nullopt,
                                  MachineMemOperand::MOLoad);
    Chain = Src.getValue(1);
  }

  // FIST writes the rounded DstVT integer to the stack slot.
  SDValue StoreOps[] = { Chain, Src, StackPtr };
  Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
                                  StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
                                  MachineMemOperand::MOStore);

  return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
}
22026
/// Custom lowering for FP_TO_SINT_SAT / FP_TO_UINT_SAT on scalar SSE sources.
SDValue
X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
  // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
  // but making use of X86 specifics to produce better instruction sequences.
  SDNode *Node = Op.getNode();
  bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
  unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
  SDLoc dl(SDValue(Node, 0));
  SDValue Src = Node->getOperand(0);

  // There are three types involved here: SrcVT is the source floating point
  // type, DstVT is the type of the result, and TmpVT is the result of the
  // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
  // DstVT).
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Node->getValueType(0);
  EVT TmpVT = DstVT;

  // This code is only for floats and doubles. Fall back to generic code for
  // anything else.
  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
    return SDValue();

  EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
  unsigned SatWidth = SatVT.getScalarSizeInBits();
  unsigned DstWidth = DstVT.getScalarSizeInBits();
  unsigned TmpWidth = TmpVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
         "Expected saturation width smaller than result width");

  // Promote result of FP_TO_*INT to at least 32 bits.
  if (TmpWidth < 32) {
    TmpVT = MVT::i32;
    TmpWidth = 32;
  }

  // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
  // us to use a native signed conversion instead.
  if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
    TmpVT = MVT::i64;
    TmpWidth = 64;
  }

  // If the saturation width is smaller than the size of the temporary result,
  // we can always use signed conversion, which is native.
  if (SatWidth < TmpWidth)
    FpToIntOpcode = ISD::FP_TO_SINT;

  // Determine minimum and maximum integer values and their corresponding
  // floating-point values.
  APInt MinInt, MaxInt;
  if (IsSigned) {
    MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
    MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
  } else {
    MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
    MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
  }

  const fltSemantics &Sem = SrcVT.getFltSemantics();
  APFloat MinFloat(Sem);
  APFloat MaxFloat(Sem);

  // Round the FP bounds toward zero so they remain inside the representable
  // integer range even when the conversion is inexact.
  APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
    MinInt, IsSigned, APFloat::rmTowardZero);
  APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
    MaxInt, IsSigned, APFloat::rmTowardZero);
  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
                             && !(MaxStatus & APFloat::opStatus::opInexact);

  SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
  SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);

  // If the integer bounds are exactly representable as floats, emit a
  // min+max+fptoi sequence. Otherwise use comparisons and selects.
  if (AreExactFloatBounds) {
    if (DstVT != TmpVT) {
      // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
      SDValue MinClamped = DAG.getNode(
        X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
      // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
      SDValue BothClamped = DAG.getNode(
        X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
      // Convert clamped value to integer.
      SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);

      // NaN will become INDVAL, with the top bit set and the rest zero.
      // Truncation will discard the top bit, resulting in zero.
      return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
    }

    // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
    SDValue MinClamped = DAG.getNode(
      X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
    // Clamp by MaxFloat from above. NaN cannot occur.
    SDValue BothClamped = DAG.getNode(
      X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
    // Convert clamped value to integer.
    SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);

    if (!IsSigned) {
      // In the unsigned case we're done, because we mapped NaN to MinFloat,
      // which is zero.
      return FpToInt;
    }

    // Otherwise, select zero if Src is NaN.
    SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
    return DAG.getSelectCC(
      dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
  }

  SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
  SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);

  // Result of direct conversion, which may be selected away.
  SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);

  if (DstVT != TmpVT) {
    // NaN will become INDVAL, with the top bit set and the rest zero.
    // Truncation will discard the top bit, resulting in zero.
    FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
  }

  SDValue Select = FpToInt;
  // For signed conversions where we saturate to the same size as the
  // result type of the fptoi instructions, INDVAL coincides with integer
  // minimum, so we don't need to explicitly check it.
  if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
    // If Src ULT MinFloat, select MinInt. In particular, this also selects
    // MinInt if Src is NaN.
    Select = DAG.getSelectCC(
      dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
  }

  // If Src OGT MaxFloat, select MaxInt.
  Select = DAG.getSelectCC(
    dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);

  // In the unsigned case we are done, because we mapped NaN to MinInt, which
  // is already zero. The promoted case was already handled above.
  if (!IsSigned || DstVT != TmpVT) {
    return Select;
  }

  // Otherwise, select 0 if Src is NaN.
  SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
  return DAG.getSelectCC(
    dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
}
22177
/// Custom lowering for (strict) FP_EXTEND: handles f16 sources via FP16 /
/// F16C hardware or a Darwin soft-float libcall, and vector widening cases.
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  MVT SVT = In.getSimpleValueType();

  // Let f16->f80 get lowered to a libcall, except for darwin, where we should
  // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
  if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
                          !Subtarget.getTargetTriple().isOSDarwin()))
    return SDValue();

  if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
      (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
    return Op;

  if (SVT == MVT::f16) {
    if (Subtarget.hasFP16())
      return Op;

    // Extend to a wider-than-f32 result in two steps: f16->f32, then
    // f32->VT, threading the strict chain through both when needed.
    if (VT != MVT::f32) {
      if (IsStrict)
        return DAG.getNode(
            ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
            {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
                                {MVT::f32, MVT::Other}, {Chain, In})});

      return DAG.getNode(ISD::FP_EXTEND, DL, VT,
                         DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
    }

    if (!Subtarget.hasF16C()) {
      if (!Subtarget.getTargetTriple().isOSDarwin())
        return SDValue();

      assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");

      // Need a libcall, but ABI for f16 is soft-float on MacOS.
      TargetLowering::CallLoweringInfo CLI(DAG);
      Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

      // Pass the half as its raw i16 bits, zero-extended per the soft-float
      // convention.
      In = DAG.getBitcast(MVT::i16, In);
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry(
          In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
      Entry.IsSExt = false;
      Entry.IsZExt = true;
      Args.push_back(Entry);

      SDValue Callee = DAG.getExternalSymbol(
          getLibcallName(RTLIB::FPEXT_F16_F32),
          getPointerTy(DAG.getDataLayout()));
      CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
          CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args));

      SDValue Res;
      std::tie(Res,Chain) = LowerCallTo(CLI);
      if (IsStrict)
        Res = DAG.getMergeValues({Res, Chain}, DL);

      return Res;
    }

    // F16C path: insert the i16 bits into a vector and use (STRICT_)CVTPH2PS,
    // then extract the scalar f32 from lane 0.
    In = DAG.getBitcast(MVT::i16, In);
    SDValue Res;
    if (IsStrict) {
      // Zero the unused lanes so the strict conversion cannot raise spurious
      // exceptions on garbage upper elements.
      In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
                       getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
                       DAG.getVectorIdxConstant(0, DL));
      Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
                        {Chain, In});
      Chain = Res.getValue(1);
    } else {
      In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
      In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
                       DAG.getUNDEF(MVT::v4i32), In,
                       DAG.getVectorIdxConstant(0, DL));
      In = DAG.getBitcast(MVT::v8i16, In);
      Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
                        DAG.getTargetConstant(4, DL, MVT::i32));
    }
    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
                      DAG.getVectorIdxConstant(0, DL));
    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, DL);
    return Res;
  }

  if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
    return Op;

  if (SVT.getVectorElementType() == MVT::f16) {
    if (Subtarget.hasFP16() && isTypeLegal(SVT))
      return Op;
    assert(Subtarget.hasF16C() && "Unexpected features!");
    // Widen small f16 vectors up to v8f16 before using VFPEXT.
    if (SVT == MVT::v2f16)
      In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
                       DAG.getUNDEF(MVT::v2f16));
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
                              DAG.getUNDEF(MVT::v4f16));
    if (IsStrict)
      return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
                         {Op->getOperand(0), Res});
    return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
  } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
    return Op;
  }

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  // Widen v2f32 to v4f32 and use VFPEXT.
  SDValue Res =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
  if (IsStrict)
    return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
                       {Op->getOperand(0), Res});
  return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
}
22299
/// Custom lowering for (strict) FP_ROUND: handles f16/bf16 results via
/// hardware converts where available, or a Darwin soft-float libcall.
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  SDLoc DL(Op);
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  MVT VT = Op.getSimpleValueType();
  MVT SVT = In.getSimpleValueType();

  if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
    return SDValue();

  // No usable hardware for f32/f64 -> f16: on Darwin emit the soft-float
  // truncation libcall, elsewhere fall back to default lowering.
  if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
      !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
    if (!Subtarget.getTargetTriple().isOSDarwin())
      return SDValue();

    // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
    TargetLowering::CallLoweringInfo CLI(DAG);
    Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

    TargetLowering::ArgListTy Args;
    TargetLowering::ArgListEntry Entry(
        In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
    Entry.IsSExt = false;
    Entry.IsZExt = true;
    Args.push_back(Entry);

    SDValue Callee = DAG.getExternalSymbol(
        getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
                                       : RTLIB::FPROUND_F32_F16),
        getPointerTy(DAG.getDataLayout()));
    CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
        CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
        std::move(Args));

    SDValue Res;
    std::tie(Res, Chain) = LowerCallTo(CLI);

    // The libcall returns the raw half bits as i16; view them as f16.
    Res = DAG.getBitcast(MVT::f16, Res);

    if (IsStrict)
      Res = DAG.getMergeValues({Res, Chain}, DL);

    return Res;
  }

  if (VT.getScalarType() == MVT::bf16) {
    if (SVT.getScalarType() == MVT::f32 &&
        ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
         Subtarget.hasAVXNECONVERT()))
      return Op;
    return SDValue();
  }

  if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
    if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
      return SDValue();

    if (VT.isVector())
      return Op;

    // Scalar F16C path: round through (STRICT_)CVTPS2PH on a v4f32 and
    // extract the i16 half bits from lane 0.
    SDValue Res;
    SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
                                        MVT::i32);
    if (IsStrict) {
      // Zero the unused lanes so the strict conversion cannot raise spurious
      // exceptions on garbage upper elements.
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
                        DAG.getConstantFP(0, DL, MVT::v4f32), In,
                        DAG.getVectorIdxConstant(0, DL));
      Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
                        {Chain, Res, Rnd});
      Chain = Res.getValue(1);
    } else {
      // FIXME: Should we use zeros for upper elements for non-strict?
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
      Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
    }

    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
                      DAG.getVectorIdxConstant(0, DL));
    Res = DAG.getBitcast(MVT::f16, Res);

    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, DL);

    return Res;
  }

  return Op;
}
22390
22392 bool IsStrict = Op->isStrictFPOpcode();
22393 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22394 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22395 "Unexpected VT!");
22396
22397 SDLoc dl(Op);
22398 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22399 DAG.getConstant(0, dl, MVT::v8i16), Src,
22400 DAG.getVectorIdxConstant(0, dl));
22401
22402 SDValue Chain;
22403 if (IsStrict) {
22404 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22405 {Op.getOperand(0), Res});
22406 Chain = Res.getValue(1);
22407 } else {
22408 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22409 }
22410
22411 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22412 DAG.getVectorIdxConstant(0, dl));
22413
22414 if (IsStrict)
22415 return DAG.getMergeValues({Res, Chain}, dl);
22416
22417 return Res;
22418}
22419
22421 bool IsStrict = Op->isStrictFPOpcode();
22422 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22423 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22424 "Unexpected VT!");
22425
22426 SDLoc dl(Op);
22427 SDValue Res, Chain;
22428 if (IsStrict) {
22429 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22430 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22431 DAG.getVectorIdxConstant(0, dl));
22432 Res = DAG.getNode(
22433 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22434 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22435 Chain = Res.getValue(1);
22436 } else {
22437 // FIXME: Should we use zeros for upper elements for non-strict?
22438 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22439 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22440 DAG.getTargetConstant(4, dl, MVT::i32));
22441 }
22442
22443 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22444 DAG.getVectorIdxConstant(0, dl));
22445
22446 if (IsStrict)
22447 return DAG.getMergeValues({Res, Chain}, dl);
22448
22449 return Res;
22450}
22451
22452SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22453 SelectionDAG &DAG) const {
22454 SDLoc DL(Op);
22455
22456 MVT SVT = Op.getOperand(0).getSimpleValueType();
22457 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22458 Subtarget.hasAVXNECONVERT())) {
22459 SDValue Res;
22460 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22461 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22462 Res = DAG.getBitcast(MVT::v8i16, Res);
22463 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22464 DAG.getVectorIdxConstant(0, DL));
22465 }
22466
22467 MakeLibCallOptions CallOptions;
22468 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22469 SDValue Res =
22470 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22471 return DAG.getBitcast(MVT::i16, Res);
22472}
22473
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  // If both operands have other uses, this is probably not profitable.
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  if (!LHS.hasOneUse() && !RHS.hasOneUse())
    return Op;

  // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
  bool IsFP = Op.getSimpleValueType().isFloatingPoint();
  if (IsFP && !Subtarget.hasSSE3())
    return Op;
  if (!IsFP && !Subtarget.hasSSSE3())
    return Op;

  // Extract from a common vector.
  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      LHS.getOperand(0) != RHS.getOperand(0) ||
      !isa<ConstantSDNode>(LHS.getOperand(1)) ||
      !isa<ConstantSDNode>(RHS.getOperand(1)) ||
      !shouldUseHorizontalOp(true, DAG, Subtarget))
    return Op;

  // Allow commuted 'hadd' ops.
  // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
  unsigned HOpcode;
  switch (Op.getOpcode()) {
  // clang-format off
  case ISD::ADD: HOpcode = X86ISD::HADD; break;
  case ISD::SUB: HOpcode = X86ISD::HSUB; break;
  case ISD::FADD: HOpcode = X86ISD::FHADD; break;
  case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
  default:
    llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
  // clang-format on
  }
  unsigned LExtIndex = LHS.getConstantOperandVal(1);
  unsigned RExtIndex = RHS.getConstantOperandVal(1);
  // Addition commutes, so canonicalize an (odd, even) index pair to
  // (even, odd); subtraction does not, so leave it as-is.
  if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
      (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
    std::swap(LExtIndex, RExtIndex);

  // Only an adjacent even/odd element pair maps onto one horizontal-op lane.
  if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
    return Op;

  SDValue X = LHS.getOperand(0);
  EVT VecVT = X.getValueType();
  unsigned BitWidth = VecVT.getSizeInBits();
  unsigned NumLanes = BitWidth / 128;
  unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
         "Not expecting illegal vector widths here");

  // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
  // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
  if (BitWidth == 256 || BitWidth == 512) {
    unsigned LaneIdx = LExtIndex / NumEltsPerLane;
    X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
    LExtIndex %= NumEltsPerLane;
  }

  // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
  // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
  // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
  // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
  SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
                     DAG.getVectorIdxConstant(LExtIndex / 2, DL));
}
22547
22548/// Depending on uarch and/or optimizing for size, we might prefer to use a
22549/// vector operation in place of the typical scalar operation.
22550SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22551 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22552 "Only expecting float/double");
22553 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22554}
22555
22556/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22557/// This mode isn't supported in hardware on X86. But as long as we aren't
22558/// compiling with trapping math, we can emulate this with
22559/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22561 SDValue N0 = Op.getOperand(0);
22562 SDLoc dl(Op);
22563 MVT VT = Op.getSimpleValueType();
22564
22565 // N0 += copysign(nextafter(0.5, 0.0), N0)
22566 const fltSemantics &Sem = VT.getFltSemantics();
22567 bool Ignored;
22568 APFloat Point5Pred = APFloat(0.5f);
22569 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22570 Point5Pred.next(/*nextDown*/true);
22571
22572 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22573 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22574 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22575
22576 // Truncate the result to remove fraction.
22577 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22578}
22579
22580/// The only differences between FABS and FNEG are the mask and the logic op.
22581/// FNEG also has a folding opportunity for FNEG(FABS(x)).
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->users())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  bool IsF128 = (VT == MVT::f128);
  assert(VT.isFloatingPoint() && VT != MVT::f80 &&
         "Unexpected type in LowerFABSorFNEG");

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  // There are no scalar bitwise logical SSE/AVX instructions, so we
  // generate a 16-byte vector constant and logic op even for the scalar case.
  // Using a 16-byte mask allows folding the load of the mask with
  // the logic op, so it can save (~4 bytes) on code size.
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64
            : (VT == MVT::f32) ? MVT::v4f32
            : MVT::v8f16;

  unsigned EltBits = VT.getScalarSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
                           APInt::getSignMask(EltBits);
  // Materialize the bit mask as an FP constant of the (possibly widened)
  // logic type; vector constants are splatted per element.
  const fltSemantics &Sem = VT.getFltSemantics();
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

  SDValue Op0 = Op.getOperand(0);
  // FNEG of FABS folds to FNABS: OR in the sign bit of the inner operand.
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp = IsFABS ? X86ISD::FAND :
                     IsFNABS ? X86ISD::FOR :
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getVectorIdxConstant(0, dl));
}
22642
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);

  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
                       DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert(VT.isFloatingPoint() && VT != MVT::f80 &&
         "Unexpected type in LowerFCOPYSIGN");

  const fltSemantics &Sem = VT.getFltSemantics();

  // Perform all scalar logic operations as 16-byte vectors because there are no
  // scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64
            : (VT == MVT::f32) ? MVT::v4f32
            : MVT::v8f16;

  // The mask constants are automatically splatted for vector types.
  // SignMask selects only the sign bit; MagMask selects everything but it.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
    // Constant (or splat-constant) magnitude: fold the abs at compile time.
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  // For the fake-vector case, extract the scalar element back out.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or
                       : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                     DAG.getVectorIdxConstant(0, dl));
}
22712
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  // Place the scalar in lane 0 of a 128-bit vector, gather the lane sign bits
  // with MOVMSK, then mask down to bit 0 (the lane holding N0).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
22730
22731/// Helper for attempting to create a X86ISD::BT node.
/// Returns the X86ISD::BT node testing bit \p BitNo of \p Src (producing an
/// i32 flags result), or SDValue() if no legal form can be built.
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reason.
  if (Src.getValueType().getScalarSizeInBits() < 32)
    Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);

  // No legal type found, give up.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
    return SDValue();

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType()) {
    // Peek through a mask/modulo operation.
    // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
    // we probably need a better IsDesirableToPromoteOp to handle this as well.
    if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
      // Extend both AND operands individually so the mask survives at the
      // wider width instead of being hidden under a single any_extend.
      BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
                          DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
                                      BitNo.getOperand(0)),
                          DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
                                      BitNo.getOperand(1)));
    else
      BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
  }

  return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
}
22771
22772/// Helper for creating a X86ISD::SETCC node.
                       SelectionDAG &DAG) {
  // Build an X86ISD::SETCC producing an i8 result from EFLAGS, with the X86
  // condition code wrapped as an i8 target constant.
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
}
22778
22779/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22780/// recognizable memcmp expansion.
22781static bool isOrXorXorTree(SDValue X, bool Root = true) {
22782 if (X.getOpcode() == ISD::OR)
22783 return isOrXorXorTree(X.getOperand(0), false) &&
22784 isOrXorXorTree(X.getOperand(1), false);
22785 if (Root)
22786 return false;
22787 return X.getOpcode() == ISD::XOR;
22788}
22789
22790/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22791/// expansion.
template <typename F>
                               EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
  SDValue Op0 = X.getOperand(0);
  SDValue Op1 = X.getOperand(1);
  if (X.getOpcode() == ISD::OR) {
    // Internal OR node: emit both subtrees and combine their results. The
    // combining op depends on whether we're producing a mask-register
    // compare (VecVT != CmpVT), a PTEST-able XOR chain, or PCMPEQ masks.
    SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
    SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
    if (VecVT != CmpVT)
      return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
    if (HasPT)
      return DAG.getNode(ISD::OR, DL, VecVT, A, B);
    return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
  }
  if (X.getOpcode() == ISD::XOR) {
    // Leaf XOR node: vectorize the scalar operands via the SToV callback and
    // compare them (or XOR them when PTEST will do the final reduction).
    SDValue A = SToV(Op0);
    SDValue B = SToV(Op1);
    if (VecVT != CmpVT)
      return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
    if (HasPT)
      return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
    return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
  }
  // Callers only invoke this on trees validated by isOrXorXorTree().
  llvm_unreachable("Impossible");
}
22817
22818/// Try to map a 128-bit or larger integer comparison to vector instructions
22819/// before type legalization splits it up into chunks.
                                               ISD::CondCode CC,
                                               const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison.
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128)
    return SDValue();

  // Ignore a comparison with zero because that gets special treatment in
  // EmitTest(). But make an exception for the special case of a pair of
  // logically-combined vector-sized operands compared to zero. This pattern may
  // be generated by the memcmp expansion pass with oversized integer compares
  // (see PR33325).
  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
    return SDValue();

  // Don't perform this combine if constructing the vector will be expensive.
  auto IsVectorBitCastCheap = [](SDValue X) {
    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
           X.getOpcode() == ISD::LOAD;
  };
  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
      !IsOrXorXorTreeCCZero)
    return SDValue();

  // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
  // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
  // Otherwise use PCMPEQ (plus AND) and mask testing.
  bool NoImplicitFloatOps =
                 Attribute::NoImplicitFloat);
  if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
      ((OpSize == 128 && Subtarget.hasSSE2()) ||
       (OpSize == 256 && Subtarget.hasAVX()) ||
       (OpSize == 512 && Subtarget.useAVX512Regs()))) {
    bool HasPT = Subtarget.hasSSE41();

    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
    // vector registers are essentially free. (Technically, widening registers
    // prevents load folding, but the tradeoff is worth it.)
    bool PreferKOT = Subtarget.preferMaskRegisters();
    bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;

    // Pick the vector compare type and (for mask registers) the vXi1 result
    // type for each operand size.
    EVT VecVT = MVT::v16i8;
    EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
    if (OpSize == 256) {
      VecVT = MVT::v32i8;
      CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
    }
    EVT CastVT = VecVT;
    bool NeedsAVX512FCast = false;
    if (OpSize == 512 || NeedZExt) {
      if (Subtarget.hasBWI()) {
        VecVT = MVT::v64i8;
        CmpVT = MVT::v64i1;
        if (OpSize == 512)
          CastVT = VecVT;
      } else {
        // No BWI: fall back to 32-bit elements (AVX512F).
        VecVT = MVT::v16i32;
        CmpVT = MVT::v16i1;
        CastVT = OpSize == 512 ? VecVT
               : OpSize == 256 ? MVT::v8i32
                               : MVT::v4i32;
        NeedsAVX512FCast = true;
      }
    }

    // Bitcast a scalar operand to the vector compare type, zero-extending a
    // narrower zext source into the wide vector when needed.
    auto ScalarToVector = [&](SDValue X) -> SDValue {
      bool TmpZext = false;
      EVT TmpCastVT = CastVT;
      if (X.getOpcode() == ISD::ZERO_EXTEND) {
        SDValue OrigX = X.getOperand(0);
        unsigned OrigSize = OrigX.getScalarValueSizeInBits();
        if (OrigSize < OpSize) {
          if (OrigSize == 128) {
            TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
            X = OrigX;
            TmpZext = true;
          } else if (OrigSize == 256) {
            TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
            X = OrigX;
            TmpZext = true;
          }
        }
      }
      X = DAG.getBitcast(TmpCastVT, X);
      if (!NeedZExt && !TmpZext)
        return X;
      // Insert into a zero vector to get the zero-extended wide vector.
      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
                         DAG.getConstant(0, DL, VecVT), X,
                         DAG.getVectorIdxConstant(0, DL));
    };

    SDValue Cmp;
    if (IsOrXorXorTreeCCZero) {
      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
      // Use 2 vector equality compares and 'and' the results before doing a
      // MOVMSK.
      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
    } else {
      SDValue VecX = ScalarToVector(X);
      SDValue VecY = ScalarToVector(Y);
      if (VecVT != CmpVT) {
        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
      } else if (HasPT) {
        Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
      } else {
        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
      }
    }
    // AVX512 should emit a setcc that will lower to kortest.
    if (VecVT != CmpVT) {
      EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
                 : CmpVT == MVT::v32i1 ? MVT::i32
                                       : MVT::i16;
      return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
                          DAG.getConstant(0, DL, KRegVT), CC);
    }
    if (HasPT) {
      SDValue BCCmp =
          DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
      SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
      SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
    }
    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    assert(Cmp.getValueType() == MVT::v16i8 &&
           "Non 128-bit vector on pre-SSE41 target");
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}
22966
22967/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22968/// style scalarized (associative) reduction patterns. Partial reductions
22969/// are supported when the pointer SrcMask is non-null.
22970/// TODO - move this to SelectionDAG?
                                 SmallVectorImpl<APInt> *SrcMask = nullptr) {
  // Map each distinct source vector to the set of element indices extracted
  // from it.
  DenseMap<SDValue, APInt> SrcOpMap;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  assert(Op.getOpcode() == unsigned(BinOp) &&
         "Unexpected bit reduction opcode");
  Opnds.push_back(Op.getOperand(0));
  Opnds.push_back(Op.getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    // BFS traverse all BinOp operands.
    if (I->getOpcode() == unsigned(BinOp)) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;

    // Quit if without a constant index.
    auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
    if (!Idx)
      return false;

    SDValue Src = I->getOperand(0);
    DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
    if (M == SrcOpMap.end()) {
      VT = Src.getValueType();
      // Quit if not the same type.
      if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
        return false;
      unsigned NumElts = VT.getVectorNumElements();
      APInt EltCount = APInt::getZero(NumElts);
      M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
      SrcOps.push_back(Src);
    }

    // Quit if element already used.
    unsigned CIdx = Idx->getZExtValue();
    if (M->second[CIdx])
      return false;
    M->second.setBit(CIdx);
  }

  if (SrcMask) {
    // Collect the source partial masks.
    for (SDValue &SrcOp : SrcOps)
      SrcMask->push_back(SrcOpMap[SrcOp]);
  } else {
    // Quit if not all elements are used.
    for (const auto &I : SrcOpMap)
      if (!I.second.isAllOnes())
        return false;
  }

  return true;
}
23038
23039// Helper function for comparing all bits of two vectors.
                                   ISD::CondCode CC, const APInt &OriginalMask,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG, X86::CondCode &X86CC) {
  EVT VT = LHS.getValueType();
  unsigned ScalarSize = VT.getScalarSizeInBits();
  if (OriginalMask.getBitWidth() != ScalarSize) {
    assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
    return SDValue();
  }

  // Quit if not convertable to legal scalar or 128/256-bit vector.
  if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
    return SDValue();

  // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
  if (VT.isFloatingPoint())
    return SDValue();

  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
  X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);

  APInt Mask = OriginalMask;

  // Apply the per-element mask to Src (no-op when the mask is all-ones).
  auto MaskBits = [&](SDValue Src) {
    if (Mask.isAllOnes())
      return Src;
    EVT SrcVT = Src.getValueType();
    SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
    return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
  };

  // For sub-128-bit vector, cast to (legal) integer and compare with zero.
  if (VT.getSizeInBits() < 128) {
    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
    if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
      if (IntVT != MVT::i64)
        return SDValue();
      // i64 isn't legal (e.g. 32-bit target): split into i32 halves, XOR the
      // halves pairwise, and OR the results - zero iff all bits matched.
      auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
                                      MVT::i32, MVT::i32);
      auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
                                      MVT::i32, MVT::i32);
      SDValue Lo =
          DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
      SDValue Hi =
          DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
      return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                         DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
                         DAG.getConstant(0, DL, MVT::i32));
    }
    return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                       DAG.getBitcast(IntVT, MaskBits(LHS)),
                       DAG.getBitcast(IntVT, MaskBits(RHS)));
  }

  // Without PTEST, a masked v2i64 or-reduction is not faster than
  // scalarization.
  bool UseKORTEST = Subtarget.useAVX512Regs();
  bool UsePTEST = Subtarget.hasSSE41();
  if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
    return SDValue();

  // Split down to 128/256/512-bit vector.
  unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);

  // If the input vector has vector elements wider than the target test size,
  // then cast to <X x i64> so it will safely split.
  if (ScalarSize > TestSize) {
    if (!Mask.isAllOnes())
      return SDValue();
    VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
    LHS = DAG.getBitcast(VT, LHS);
    RHS = DAG.getBitcast(VT, RHS);
    Mask = APInt::getAllOnes(64);
  }

  // Reduce oversized vectors down to TestSize before emitting the final test.
  if (VT.getSizeInBits() > TestSize) {
    KnownBits KnownRHS = DAG.computeKnownBits(RHS);
    if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
      // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
      while (VT.getSizeInBits() > TestSize) {
        auto Split = DAG.SplitVector(LHS, DL);
        VT = Split.first.getValueType();
        LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
      }
      RHS = DAG.getAllOnesConstant(DL, VT);
    } else if (!UsePTEST && !KnownRHS.isZero()) {
      // MOVMSK Special Case:
      // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
      MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
      VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
      LHS = DAG.getBitcast(VT, MaskBits(LHS));
      RHS = DAG.getBitcast(VT, MaskBits(RHS));
      EVT BoolVT = VT.changeVectorElementType(MVT::i1);
      SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
      V = DAG.getSExtOrTrunc(V, DL, VT);
      while (VT.getSizeInBits() > TestSize) {
        auto Split = DAG.SplitVector(V, DL);
        VT = Split.first.getValueType();
        V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
      }
      // Invert so the reduced compare result is zero iff all lanes matched.
      V = DAG.getNOT(DL, V, VT);
      V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
                         DAG.getConstant(0, DL, MVT::i32));
    } else {
      // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
      SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
      while (VT.getSizeInBits() > TestSize) {
        auto Split = DAG.SplitVector(V, DL);
        VT = Split.first.getValueType();
        V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
      }
      LHS = V;
      RHS = DAG.getConstant(0, DL, VT);
    }
  }

  if (UseKORTEST && VT.is512BitVector()) {
    MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
    MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
    LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
    RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
    SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
    return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
  }

  if (UsePTEST) {
    MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
    LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
    RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
    SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
    return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
  }

  // Pre-SSE41 fallback: PCMPEQ, invert, MOVMSK, compare mask against zero.
  assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
  MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
  LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
  RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
  SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
  V = DAG.getNOT(DL, V, MaskVT);
  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
  return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
                     DAG.getConstant(0, DL, MVT::i32));
}
23185
23186// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
23187// to CMP(MOVMSK(PCMPEQB(X,Y))).
                                       ISD::CondCode CC, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG,
                                       X86::CondCode &X86CC) {
  SDValue Op = OrigLHS;

  bool CmpNull;
  APInt Mask;
  if (CC == ISD::SETEQ || CC == ISD::SETNE) {
    CmpNull = isNullConstant(OrigRHS);
    if (!CmpNull && !isAllOnesConstant(OrigRHS))
      return SDValue();

    if (!Subtarget.hasSSE2() || !Op->hasOneUse())
      return SDValue();

    // Check whether we're masking/truncating an OR-reduction result, in which
    // case track the masked bits.
    // TODO: Add CmpAllOnes support.
    Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
    if (CmpNull) {
      switch (Op.getOpcode()) {
      case ISD::TRUNCATE: {
        // Peek through the truncate; only the low (result-width) bits matter.
        SDValue Src = Op.getOperand(0);
        Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
                                    Op.getScalarValueSizeInBits());
        Op = Src;
        break;
      }
      case ISD::AND: {
        // Peek through a mask with a constant; record the masked bits.
        if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Mask = Cst->getAPIntValue();
          Op = Op.getOperand(0);
        }
        break;
      }
      }
    }
  } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
    // icmp sgt X, -1 tests only the sign bit: rewrite as (X & signmask) == 0.
    CC = ISD::SETEQ;
    CmpNull = true;
    Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
  } else {
    return SDValue();
  }

  ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;

  // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
  // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
  if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
    EVT VT = VecIns[0].getValueType();
    assert(llvm::all_of(VecIns,
                        [VT](SDValue V) { return VT == V.getValueType(); }) &&
           "Reduction source vector mismatch");

    // Quit if not splittable to scalar/128/256/512-bit vector.
    if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
      return SDValue();

    // If more than one full vector is evaluated, AND/OR them first before
    // PTEST.
    for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
         Slot += 2, e += 1) {
      // Each iteration will AND/OR 2 nodes and append the result until there is
      // only 1 node left, i.e. the final value of all vectors.
      SDValue LHS = VecIns[Slot];
      SDValue RHS = VecIns[Slot + 1];
      VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
    }

    return LowerVectorAllEqual(DL, VecIns.back(),
                               CmpNull ? DAG.getConstant(0, DL, VT)
                                       : DAG.getAllOnesConstant(DL, VT),
                               CC, Mask, Subtarget, DAG, X86CC);
  }

  // Match icmp(reduce_or(X),0) anyof reduction patterns.
  // Match icmp(reduce_and(X),-1) allof reduction patterns.
  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ISD::NodeType BinOp;
    if (SDValue Match =
            DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
      EVT MatchVT = Match.getValueType();
      return LowerVectorAllEqual(DL, Match,
                                 CmpNull ? DAG.getConstant(0, DL, MatchVT)
                                         : DAG.getAllOnesConstant(DL, MatchVT),
                                 CC, Mask, Subtarget, DAG, X86CC);
    }
  }

  if (Mask.isAllOnes()) {
    assert(!Op.getValueType().isVector() &&
           "Illegal vector type for reduction pattern");
    if (Src.getValueType().isFixedLengthVector() &&
        Src.getValueType().getScalarType() == MVT::i1) {
      // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
      // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
      if (Src.getOpcode() == ISD::SETCC) {
        SDValue LHS = Src.getOperand(0);
        SDValue RHS = Src.getOperand(1);
        EVT LHSVT = LHS.getValueType();
        ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
        if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
            llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
          APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
          return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
                                     X86CC);
        }
      }
      // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
      // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
      // Peek through truncation, mask the LSB and compare against zero/LSB.
      if (Src.getOpcode() == ISD::TRUNCATE) {
        SDValue Inner = Src.getOperand(0);
        EVT InnerVT = Inner.getValueType();
        if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
          unsigned BW = InnerVT.getScalarSizeInBits();
          APInt SrcMask = APInt(BW, 1);
          APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
          return LowerVectorAllEqual(DL, Inner,
                                     DAG.getConstant(Cmp, DL, InnerVT), CC,
                                     SrcMask, Subtarget, DAG, X86CC);
        }
      }
    }
  }

  return SDValue();
}
23321
23322/// return true if \c Op has a use that doesn't just read flags.
  for (SDUse &Use : Op->uses()) {
    SDNode *User = Use.getUser();
    unsigned UOpNo = Use.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past truncate.
      UOpNo = User->use_begin()->getOperandNo();
      User = User->use_begin()->getUser();
    }

    // Flags-only consumers are BRCOND, SETCC, and SELECT when the value feeds
    // the condition operand (operand 0). Any other user reads the full value.
    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
      return true;
  }
  return false;
}
23339
23340// Transform to an x86-specific ALU node with flags if there is a chance of
23341// using an RMW op or only the flags are used. Otherwise, leave
23342// the node alone and emit a 'cmp' or 'test' instruction.
  // Profitable only if every user is a CopyToReg, SETCC, or STORE - i.e. the
  // value either feeds an RMW-style sink or only the flags matter.
  for (SDNode *U : Op->users())
    if (U->getOpcode() != ISD::CopyToReg &&
        U->getOpcode() != ISD::SETCC &&
        U->getOpcode() != ISD::STORE)
      return false;

  return true;
}
23352
23353/// Emit nodes that will be selected as "test Op0,Op0", or something
23354/// equivalent.
                        SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      [[fallthrough]];
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  SDValue ArithOp = Op;

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    if (!hasNonFlagsUse(Op))
      break;

    [[fallthrough]];
  case ISD::ADD:
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
      break;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    // clang-format off
    default: llvm_unreachable("unexpected operator!");
    case ISD::ADD: Opcode = X86ISD::ADD; break;
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: Opcode = X86ISD::OR; break;
    // clang-format on
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Already an X86 flag-producing node: reuse its flags result (value 1).
    return SDValue(Op.getNode(), 1);
  case ISD::SSUBO:
  case ISD::USUBO: {
    // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
                       Op->getOperand(1)).getValue(1);
  }
  default:
    break;
  }

  if (Opcode == 0) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  // Rebuild the op as its flag-setting X86 counterpart, replace all uses of
  // the original value with the new node, and hand back its flags result.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
  return SDValue(New.getNode(), 1);
}
23462
23463/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23464/// equivalent.
/// Returns a value whose result is the EFLAGS of comparing Op0 with Op1,
/// after opportunistically promoting i16 immediate compares to i32,
/// shrinking i64 compares, and folding 0-x negation patterns into
/// flag-producing ADD/SUB/XOR nodes.
/// NOTE(review): the opening line of this definition (function name and the
/// leading parameters — presumably Op0, Op1 and X86CC, given their uses
/// below) is not visible in this listing.
23466                       const SDLoc &dl, SelectionDAG &DAG,
23467                       const X86Subtarget &Subtarget) {
  // Comparing against zero reduces to a TEST of Op0 with itself.
23468  if (isNullConstant(Op1))
23469    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23470
23471  EVT CmpVT = Op0.getValueType();
23472
23473  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23474          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23475
23476  // Only promote the compare up to I32 if it is a 16 bit operation
23477  // with an immediate. 16 bit immediates are to be avoided unless the target
23478  // isn't slowed down by length changing prefixes, we're optimizing for
23479  // codesize or the comparison is with a folded load.
23480  if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23481      !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
      // NOTE(review): the final clause of this condition (original line 23482)
      // is not visible in this listing.
23483    auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23484    auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23485    // Don't do this if the immediate can fit in 8-bits.
23486    if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23487        (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23488      unsigned ExtendOp =
      // NOTE(review): the initializer of ExtendOp (original line 23489) is
      // not visible in this listing; presumably it selects between sign and
      // zero extension — the code below only overrides it with SIGN_EXTEND.
23490      if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23491        // For equality comparisons try to use SIGN_EXTEND if the input was
23492        // truncate from something with enough sign bits.
23493        if (Op0.getOpcode() == ISD::TRUNCATE) {
23494          if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23495            ExtendOp = ISD::SIGN_EXTEND;
23496        } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23497          if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23498            ExtendOp = ISD::SIGN_EXTEND;
23499        }
23500      }
23501
23502      CmpVT = MVT::i32;
23503      Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23504      Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23505    }
23506  }
23507
23508  // Try to shrink i64 compares if the input has enough zero bits.
23509  if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23510      Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23511      DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23512      DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23513    CmpVT = MVT::i32;
23514    Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23515    Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23516  }
23517
23518  // Try to shrink all i64 compares if the inputs are representable as signed
23519  // i32.
23520  if (CmpVT == MVT::i64 &&
23521      Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23522      DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23523    CmpVT = MVT::i32;
23524    Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23525    Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23526  }
23527
23528  // 0-x == y --> x+y == 0
23529  // 0-x != y --> x+y != 0
23530  if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23531      Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23532    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23533    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23534    return Add.getValue(1);
23535  }
23536
23537  // x == 0-y --> x+y == 0
23538  // x != 0-y --> x+y != 0
23539  if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23540      Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23541    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23542    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23543    return Add.getValue(1);
23544  }
23545
23546  // If we already have an XOR of the ops, use that to check for equality.
23547  // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23548  unsigned X86Opc = X86ISD::SUB;
23549  if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23550      (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23551       DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23552    X86Opc = X86ISD::XOR;
23553
23554  SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23555  SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23556  return CmpOp.getValue(1);
23557}
23558
// NOTE(review): the declaration line opening this X86TargetLowering member
// (and its earlier parameters — a 'Cond' condition code is referenced below)
// is not visible in this listing.
// Returns true unless VT is a vector type and the condition is SETEQ.
23560                                                          EVT VT) const {
23561  return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
23562}
23563
23564bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23565 SDNode *N, SDValue, SDValue IntPow2) const {
23566 if (N->getOpcode() == ISD::FDIV)
23567 return true;
23568
23569 EVT FPVT = N->getValueType(0);
23570 EVT IntVT = IntPow2.getValueType();
23571
23572 // This indicates a non-free bitcast.
23573 // TODO: This is probably overly conservative as we will need to scale the
23574 // integer vector anyways for the int->fp cast.
23575 if (FPVT.isVector() &&
23576 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23577 return false;
23578
23579 return true;
23580}
23581
23582/// Check if replacement of SQRT with RSQRT should be disabled.
23583bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23584 EVT VT = Op.getValueType();
23585
23586 // We don't need to replace SQRT with RSQRT for half type.
23587 if (VT.getScalarType() == MVT::f16)
23588 return true;
23589
23590 // We never want to use both SQRT and RSQRT instructions for the same input.
23591 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23592 return false;
23593
23594 if (VT.isVector())
23595 return Subtarget.hasFastVectorFSQRT();
23596 return Subtarget.hasFastScalarFSQRT();
23597}
23598
23599/// The minimum architected relative accuracy is 2^-12. We need one
23600/// Newton-Raphson step to have a good float result (24 bits of precision).
/// Emits an RSQRT-based estimate — multiplied back by Op when approximating
/// SQRT itself rather than its reciprocal — for profitable type/subtarget
/// combinations; returns an empty SDValue to request the default expansion.
23601SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23602                                           SelectionDAG &DAG, int Enabled,
23603                                           int &RefinementSteps,
23604                                           bool &UseOneConstNR,
23605                                           bool Reciprocal) const {
23606  SDLoc DL(Op);
23607  EVT VT = Op.getValueType();
23608
23609  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23610  // It is likely not profitable to do this for f64 because a double-precision
23611  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23612  // instructions: convert to single, rsqrtss, convert back to double, refine
23613  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23614  // along with FMA, this could be a throughput win.
23615  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23616  // after legalize types.
23617  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23618      (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23619      (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23620      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23621      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23622    if (RefinementSteps == ReciprocalEstimate::Unspecified)
23623      RefinementSteps = 1;
23624
23625    UseOneConstNR = false;
23626    // There is no FSQRT for 512-bits, but there is RSQRT14.
23627    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23628    SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
    // sqrt(x) ~= x * rsqrt(x); only needed when the caller wants SQRT itself
    // and no refinement step would otherwise perform the multiply.
23629    if (RefinementSteps == 0 && !Reciprocal)
23630      Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23631    return Estimate;
23632  }
23633
23634  if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23635      Subtarget.hasFP16()) {
23636    assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23637    if (RefinementSteps == ReciprocalEstimate::Unspecified)
23638      RefinementSteps = 0;
23639
23640    if (VT == MVT::f16) {
      // NOTE(review): the line defining 'Zero' (original line 23641, used as
      // the extract index below) is not visible in this listing.
23642      SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23643      Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23644      Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23645      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23646    }
23647
23648    return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23649  }
23650  return SDValue();
23651}
23652
23653/// The minimum architected relative accuracy is 2^-12. We need one
23654/// Newton-Raphson step to have a good float result (24 bits of precision).
/// Emits an RCP-based reciprocal estimate for profitable type/subtarget
/// combinations; returns an empty SDValue to request the default expansion.
23655SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23656                                            int Enabled,
23657                                            int &RefinementSteps) const {
23658  SDLoc DL(Op);
23659  EVT VT = Op.getValueType();
23660
23661  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23662  // It is likely not profitable to do this for f64 because a double-precision
23663  // reciprocal estimate with refinement on x86 prior to FMA requires
23664  // 15 instructions: convert to single, rcpss, convert back to double, refine
23665  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23666  // along with FMA, this could be a throughput win.
23667
23668  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23669      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23670      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23671      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23672    // Enable estimate codegen with 1 refinement step for vector division.
23673    // Scalar division estimates are disabled because they break too much
23674    // real-world code. These defaults are intended to match GCC behavior.
23675    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23676      return SDValue();
23677
23678    if (RefinementSteps == ReciprocalEstimate::Unspecified)
23679      RefinementSteps = 1;
23680
23681    // There is no FSQRT for 512-bits, but there is RCP14.
23682    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23683    return DAG.getNode(Opcode, DL, VT, Op);
23684  }
23685
23686  if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23687      Subtarget.hasFP16()) {
23688    if (RefinementSteps == ReciprocalEstimate::Unspecified)
23689      RefinementSteps = 0;
23690
23691    if (VT == MVT::f16) {
      // NOTE(review): the line defining 'Zero' (original line 23692, used as
      // the extract index below) is not visible in this listing.
23693      SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23694      Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23695      Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23696      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23697    }
23698
23699    return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23700  }
23701  return SDValue();
23702}
23703
23704/// If we have at least two divisions that use the same divisor, convert to
23705/// multiplication by a reciprocal. This may need to be adjusted for a given
23706/// CPU if a division's cost is not at least twice the cost of a multiplication.
23707/// This is because we still need one division to calculate the reciprocal and
23708/// then we need two multiplies by that reciprocal as replacements for the
23709/// original divisions.
23710unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23711 return 2;
23712}
23713
/// Lower sdiv by a (possibly negated) power-of-2 constant via a CMOV-based
/// sequence, when CMOV is available and a real divide is not already cheap.
/// Returns an empty SDValue to request the default expansion.
23714SDValue
23715X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23716                                 SelectionDAG &DAG,
23717                                 SmallVectorImpl<SDNode *> &Created) const {
  // NOTE(review): the line initializing 'Attr' (original line 23718, the
  // function attribute list queried below) is not visible in this listing.
23719  if (isIntDivCheap(N->getValueType(0), Attr))
23720    return SDValue(N,0); // Lower SDIV as SDIV
23721
23722  assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23723         "Unexpected divisor!");
23724
23725  // Only perform this transform if CMOV is supported otherwise the select
23726  // below will become a branch.
23727  if (!Subtarget.canUseCMOV())
23728    return SDValue();
23729
23730  // fold (sdiv X, pow2)
23731  EVT VT = N->getValueType(0);
23732  // FIXME: Support i8.
23733  if (VT != MVT::i16 && VT != MVT::i32 &&
23734      !(Subtarget.is64Bit() && VT == MVT::i64))
23735    return SDValue();
23736
23737  // If the divisor is 2 or -2, the default expansion is better.
23738  if (Divisor == 2 ||
23739      Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23740    return SDValue();
23741
23742  return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23743}
23744
23745/// Result of 'and' is compared against zero. Change to a BT node if possible.
23746/// Returns the BT node and the condition code needed to use it.
/// NOTE(review): the declaration line opening this helper (presumably taking
/// the AND value 'And', an ISD::CondCode 'CC' and an SDLoc 'dl', given their
/// uses below) is not visible in this listing.
23748                                SelectionDAG &DAG, X86::CondCode &X86CC) {
23749  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23750  SDValue Op0 = And.getOperand(0);
23751  SDValue Op1 = And.getOperand(1);
  // Look through truncates on both operands; safety of doing so for the SHL
  // pattern is re-checked below via computeKnownBits.
23752  if (Op0.getOpcode() == ISD::TRUNCATE)
23753    Op0 = Op0.getOperand(0);
23754  if (Op1.getOpcode() == ISD::TRUNCATE)
23755    Op1 = Op1.getOperand(0);
23756
23757  SDValue Src, BitNo;
  // Canonicalize a shift operand into Op0 so only one form is matched.
23758  if (Op1.getOpcode() == ISD::SHL)
23759    std::swap(Op0, Op1);
23760  if (Op0.getOpcode() == ISD::SHL) {
23761    if (isOneConstant(Op0.getOperand(0))) {
23762      // If we looked past a truncate, check that it's only truncating away
23763      // known zeros.
23764      unsigned BitWidth = Op0.getValueSizeInBits();
23765      unsigned AndBitWidth = And.getValueSizeInBits();
23766      if (BitWidth > AndBitWidth) {
23767        KnownBits Known = DAG.computeKnownBits(Op0);
23768        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23769          return SDValue();
23770      }
23771      Src = Op1;
23772      BitNo = Op0.getOperand(1);
23773    }
23774  } else if (Op1.getOpcode() == ISD::Constant) {
23775    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23776    uint64_t AndRHSVal = AndRHS->getZExtValue();
23777    SDValue AndLHS = Op0;
23778
    // (srl X, N) & 1 --> test bit N of X.
23779    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23780      Src = AndLHS.getOperand(0);
23781      BitNo = AndLHS.getOperand(1);
23782    } else {
23783      // Use BT if the immediate can't be encoded in a TEST instruction or we
23784      // are optimizing for size and the immediate won't fit in a byte.
23785      bool OptForSize = DAG.shouldOptForSize();
23786      if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23787          isPowerOf2_64(AndRHSVal)) {
23788        Src = AndLHS;
23789        BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23790                                Src.getValueType());
23791      }
23792    }
23793  }
23794
23795  // No patterns found, give up.
23796  if (!Src.getNode())
23797    return SDValue();
23798
23799  // Remove any bit flip.
23800  if (isBitwiseNot(Src)) {
23801    Src = Src.getOperand(0);
23802    CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23803  }
23804
23805  // Attempt to create the X86ISD::BT node.
23806  if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
    // BT puts the tested bit in CF: AE (CF==0) for equality with zero,
    // B (CF==1) otherwise.
23807    X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23808    return BT;
23809  }
23810
23811  return SDValue();
23812}
23813
23814// Check if pre-AVX condcode can be performed by a single FCMP op.
23815static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23816 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23817}
23818
23819/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23820/// CMPs.
/// May swap Op0/Op1 (for GT/GE/ULT/ULE forms) so the predicate can be
/// expressed with the compare directions the hardware provides. Sets
/// IsAlwaysSignaling when the chosen predicate is an ordering compare.
23821static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23822                                   SDValue &Op1, bool &IsAlwaysSignaling) {
23823  unsigned SSECC;
23824  bool Swap = false;
23825
23826  // SSE Condition code mapping:
23827  //  0 - EQ
23828  //  1 - LT
23829  //  2 - LE
23830  //  3 - UNORD
23831  //  4 - NEQ
23832  //  5 - NLT
23833  //  6 - NLE
23834  //  7 - ORD
23835  switch (SetCCOpcode) {
23836  // clang-format off
23837  default: llvm_unreachable("Unexpected SETCC condition");
23838  case ISD::SETOEQ:
23839  case ISD::SETEQ: SSECC = 0; break;
23840  case ISD::SETOGT:
23841  case ISD::SETGT: Swap = true; [[fallthrough]];
23842  case ISD::SETLT:
23843  case ISD::SETOLT: SSECC = 1; break;
23844  case ISD::SETOGE:
23845  case ISD::SETGE: Swap = true; [[fallthrough]];
23846  case ISD::SETLE:
23847  case ISD::SETOLE: SSECC = 2; break;
23848  case ISD::SETUO: SSECC = 3; break;
23849  case ISD::SETUNE:
23850  case ISD::SETNE: SSECC = 4; break;
23851  case ISD::SETULE: Swap = true; [[fallthrough]];
23852  case ISD::SETUGE: SSECC = 5; break;
23853  case ISD::SETULT: Swap = true; [[fallthrough]];
23854  case ISD::SETUGT: SSECC = 6; break;
23855  case ISD::SETO: SSECC = 7; break;
  // Values >= 8 need AVX's extended predicate field; presumably pre-AVX
  // callers split these two via cheapX86FSETCC_SSE above — TODO confirm.
23856  case ISD::SETUEQ: SSECC = 8; break;
23857  case ISD::SETONE: SSECC = 12; break;
23858  // clang-format on
23859  }
23860  if (Swap)
23861    std::swap(Op0, Op1);
23862
  // Classify quiet vs signaling: equality/(un)ordered tests are quiet; all
  // ordering predicates (the default case) always signal on NaN.
23863  switch (SetCCOpcode) {
23864  default:
23865    IsAlwaysSignaling = true;
23866    break;
23867  case ISD::SETEQ:
23868  case ISD::SETOEQ:
23869  case ISD::SETUEQ:
23870  case ISD::SETNE:
23871  case ISD::SETONE:
23872  case ISD::SETUNE:
23873  case ISD::SETO:
23874  case ISD::SETUO:
23875    IsAlwaysSignaling = false;
23876    break;
23877  }
23878
23879  return SSECC;
23880}
23881
23882/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23883/// concatenate the result back.
/// NOTE(review): the declaration line opening this helper (presumably taking
/// the result VT, the LHS/RHS operands and an ISD::CondCode 'Cond', given
/// their uses below) is not visible in this listing.
23885                              SelectionDAG &DAG, const SDLoc &dl) {
23886  assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23887         "Unsupported VTs!");
23888  SDValue CC = DAG.getCondCode(Cond);
23889
23890  // Extract the LHS Lo/Hi vectors
23891  SDValue LHS1, LHS2;
23892  std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23893
23894  // Extract the RHS Lo/Hi vectors
23895  SDValue RHS1, RHS2;
23896  std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23897
23898  // Issue the operation on the smaller types and concatenate the result back
23899  EVT LoVT, HiVT;
23900  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23901  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23902                     DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23903                     DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23904}
23905
/// Lower an integer vector SETCC producing a vXi1 mask, canonicalizing SETLT
/// into SETGT with swapped operands before emitting the compare.
/// NOTE(review): the declaration line opening this helper (presumably taking
/// 'Op' and an SDLoc 'dl', given their uses below) is not visible in this
/// listing.
23907                                     SelectionDAG &DAG) {
23908  SDValue Op0 = Op.getOperand(0);
23909  SDValue Op1 = Op.getOperand(1);
23910  SDValue CC = Op.getOperand(2);
23911  MVT VT = Op.getSimpleValueType();
23912  assert(VT.getVectorElementType() == MVT::i1 &&
23913         "Cannot set masked compare for this operation");
23914
23915  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23916
23917  // Prefer SETGT over SETLT.
23918  if (SetCCOpcode == ISD::SETLT) {
23919    SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23920    std::swap(Op0, Op1);
23921  }
23922
23923  return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23924}
23925
23926/// Given a buildvector constant, return a new vector constant with each element
23927/// incremented or decremented. If incrementing or decrementing would result in
23928/// unsigned overflow or underflow or this is not a simple vector constant,
23929/// return an empty value.
/// NOTE(review): the declaration line opening this helper (presumably taking
/// the vector 'V', the DAG and an 'IsInc' flag, given their uses below) is
/// not visible in this listing.
23931                                    bool NSW) {
23932  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23933  if (!BV || !V.getValueType().isSimple())
23934    return SDValue();
23935
23936  MVT VT = V.getSimpleValueType();
23937  MVT EltVT = VT.getVectorElementType();
23938  unsigned NumElts = VT.getVectorNumElements();
  // NOTE(review): the declaration of 'NewVecC' (original line 23939, the
  // vector collecting the adjusted elements) is not visible in this listing.
23940  SDLoc DL(V);
23941  for (unsigned i = 0; i < NumElts; ++i) {
    // Every element must be a non-opaque constant of the element type.
23942    auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23943    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23944      return SDValue();
23945
23946    // Avoid overflow/underflow.
23947    const APInt &EltC = Elt->getAPIntValue();
23948    if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23949      return SDValue();
    // With NSW requested, also reject signed overflow/underflow.
23950    if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23951                (!IsInc && EltC.isMinSignedValue())))
23952      return SDValue();
23953
23954    NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23955  }
23956
23957  return DAG.getBuildVector(VT, DL, NewVecC);
23958}
23959
23960/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23961/// Op0 u<= Op1:
23962///   t = psubus Op0, Op1
23963///   pcmpeq t, <0..0>
/// NOTE(review): the declaration line opening this helper (presumably taking
/// Op0, Op1 and the result VT, given their uses below) is not visible in
/// this listing.
23965                                    ISD::CondCode Cond, const SDLoc &dl,
23966                                    const X86Subtarget &Subtarget,
23967                                    SelectionDAG &DAG) {
  // USUBSAT (psubus) needs SSE2 and only exists for i8/i16 elements.
23968  if (!Subtarget.hasSSE2())
23969    return SDValue();
23970
23971  MVT VET = VT.getVectorElementType();
23972  if (VET != MVT::i8 && VET != MVT::i16)
23973    return SDValue();
23974
  // Canonicalize the condition into the "Op0 u<= Op1" shape handled by the
  // psubus + pcmpeq sequence emitted at the bottom.
23975  switch (Cond) {
23976  default:
23977    return SDValue();
23978  case ISD::SETULT: {
23979    // If the comparison is against a constant we can turn this into a
23980    // setule. With psubus, setule does not require a swap. This is
23981    // beneficial because the constant in the register is no longer
23982    // destructed as the destination so it can be hoisted out of a loop.
23983    // Only do this pre-AVX since vpcmp* is no longer destructive.
23984    if (Subtarget.hasAVX())
23985      return SDValue();
23986    SDValue ULEOp1 =
23987        incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23988    if (!ULEOp1)
23989      return SDValue();
23990    Op1 = ULEOp1;
23991    break;
23992  }
23993  case ISD::SETUGT: {
23994    // If the comparison is against a constant, we can turn this into a setuge.
23995    // This is beneficial because materializing a constant 0 for the PCMPEQ is
23996    // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23997    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23998    SDValue UGEOp1 =
23999        incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24000    if (!UGEOp1)
24001      return SDValue();
24002    Op1 = Op0;
24003    Op0 = UGEOp1;
24004    break;
24005  }
24006  // Psubus is better than flip-sign because it requires no inversion.
24007  case ISD::SETUGE:
24008    std::swap(Op0, Op1);
24009    break;
24010  case ISD::SETULE:
24011    break;
24012  }
24013
  // Op0 u<= Op1 iff usubsat(Op0, Op1) == 0.
24014  SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24015  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24016                     DAG.getConstant(0, dl, VT));
24017}
24018
24019static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24020 SelectionDAG &DAG) {
24021 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24022 Op.getOpcode() == ISD::STRICT_FSETCCS;
24023 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24024 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24025 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24026 MVT VT = Op->getSimpleValueType(0);
24027 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24028 MVT OpVT = Op0.getSimpleValueType();
24029 SDLoc dl(Op);
24030
24031 if (OpVT.isFloatingPoint()) {
24032 MVT EltVT = OpVT.getVectorElementType();
24033 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24034 EltVT == MVT::f64);
24035
24036 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24037 if (isSoftF16(EltVT, Subtarget)) {
24038 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24039 return SDValue();
24040
24041 // Break 256-bit FP vector compare into smaller ones.
24042 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24043 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24044
24045 // Break 512-bit FP vector compare into smaller ones.
24046 if (OpVT.is512BitVector())
24047 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24048
24049 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24050 if (IsStrict) {
24051 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24052 {Chain, Op0});
24053 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24054 {Chain, Op1});
24055 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24056 {Chain, Op0, Op1, CC});
24057 }
24058 MVT DVT = VT.getVectorElementType() == MVT::i16
24059 ? VT.changeVectorElementType(MVT::i32)
24060 : VT;
24061 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24062 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24063 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24064 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24065 }
24066
24067 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24068
24069 // If we have a strict compare with a vXi1 result and the input is 128/256
24070 // bits we can't use a masked compare unless we have VLX. If we use a wider
24071 // compare like we do for non-strict, we might trigger spurious exceptions
24072 // from the upper elements. Instead emit a AVX compare and convert to mask.
24073 unsigned Opc;
24074 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24075 (!IsStrict || Subtarget.hasVLX() ||
24077#ifndef NDEBUG
24078 unsigned Num = VT.getVectorNumElements();
24079 assert(Num <= 16 ||
24080 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24081#endif
24082 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24083 } else {
24084 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24085 // The SSE/AVX packed FP comparison nodes are defined with a
24086 // floating-point vector result that matches the operand type. This allows
24087 // them to work with an SSE1 target (integer vector types are not legal).
24088 VT = Op0.getSimpleValueType();
24089 }
24090
24091 SDValue Cmp;
24092 bool IsAlwaysSignaling;
24093 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24094 if (!Subtarget.hasAVX()) {
24095 // TODO: We could use following steps to handle a quiet compare with
24096 // signaling encodings.
24097 // 1. Get ordered masks from a quiet ISD::SETO
24098 // 2. Use the masks to mask potential unordered elements in operand A, B
24099 // 3. Get the compare results of masked A, B
24100 // 4. Calculating final result using the mask and result from 3
24101 // But currently, we just fall back to scalar operations.
24102 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24103 return SDValue();
24104
24105 // Insert an extra signaling instruction to raise exception.
24106 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24107 SDValue SignalCmp = DAG.getNode(
24108 Opc, dl, {VT, MVT::Other},
24109 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24110 // FIXME: It seems we need to update the flags of all new strict nodes.
24111 // Otherwise, mayRaiseFPException in MI will return false due to
24112 // NoFPExcept = false by default. However, I didn't find it in other
24113 // patches.
24114 SignalCmp->setFlags(Op->getFlags());
24115 Chain = SignalCmp.getValue(1);
24116 }
24117
24118 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24119 // emit two comparisons and a logic op to tie them together.
24120 if (!cheapX86FSETCC_SSE(Cond)) {
24121 // LLVM predicate is SETUEQ or SETONE.
24122 unsigned CC0, CC1;
24123 unsigned CombineOpc;
24124 if (Cond == ISD::SETUEQ) {
24125 CC0 = 3; // UNORD
24126 CC1 = 0; // EQ
24127 CombineOpc = X86ISD::FOR;
24128 } else {
24130 CC0 = 7; // ORD
24131 CC1 = 4; // NEQ
24132 CombineOpc = X86ISD::FAND;
24133 }
24134
24135 SDValue Cmp0, Cmp1;
24136 if (IsStrict) {
24137 Cmp0 = DAG.getNode(
24138 Opc, dl, {VT, MVT::Other},
24139 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24140 Cmp1 = DAG.getNode(
24141 Opc, dl, {VT, MVT::Other},
24142 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24143 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24144 Cmp1.getValue(1));
24145 } else {
24146 Cmp0 = DAG.getNode(
24147 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24148 Cmp1 = DAG.getNode(
24149 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24150 }
24151 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24152 } else {
24153 if (IsStrict) {
24154 Cmp = DAG.getNode(
24155 Opc, dl, {VT, MVT::Other},
24156 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24157 Chain = Cmp.getValue(1);
24158 } else
24159 Cmp = DAG.getNode(
24160 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24161 }
24162 } else {
24163 // Handle all other FP comparisons here.
24164 if (IsStrict) {
24165 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24166 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24167 Cmp = DAG.getNode(
24168 Opc, dl, {VT, MVT::Other},
24169 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24170 Chain = Cmp.getValue(1);
24171 } else
24172 Cmp = DAG.getNode(
24173 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24174 }
24175
24176 if (VT.getFixedSizeInBits() >
24177 Op.getSimpleValueType().getFixedSizeInBits()) {
24178 // We emitted a compare with an XMM/YMM result. Finish converting to a
24179 // mask register using a vptestm.
24181 Cmp = DAG.getBitcast(CastVT, Cmp);
24182 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24183 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24184 } else {
24185 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24186 // the result type of SETCC. The bitcast is expected to be optimized
24187 // away during combining/isel.
24188 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24189 }
24190
24191 if (IsStrict)
24192 return DAG.getMergeValues({Cmp, Chain}, dl);
24193
24194 return Cmp;
24195 }
24196
24197 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24198
24199 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24200 assert(VTOp0 == Op1.getSimpleValueType() &&
24201 "Expected operands with same type!");
24203 "Invalid number of packed elements for source and destination!");
24204
24205 // The non-AVX512 code below works under the assumption that source and
24206 // destination types are the same.
24207 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24208 "Value types for source and destination must be the same!");
24209
24210 // The result is boolean, but operands are int/float
24211 if (VT.getVectorElementType() == MVT::i1) {
24212 // In AVX-512 architecture setcc returns mask with i1 elements,
24213 // But there is no compare instruction for i8 and i16 elements in KNL.
24214 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24215 "Unexpected operand type");
24216 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24217 }
24218
24219 // Lower using XOP integer comparisons.
24220 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24221 // Translate compare code to XOP PCOM compare mode.
24222 unsigned CmpMode = 0;
24223 switch (Cond) {
24224 // clang-format off
24225 default: llvm_unreachable("Unexpected SETCC condition");
24226 case ISD::SETULT:
24227 case ISD::SETLT: CmpMode = 0x00; break;
24228 case ISD::SETULE:
24229 case ISD::SETLE: CmpMode = 0x01; break;
24230 case ISD::SETUGT:
24231 case ISD::SETGT: CmpMode = 0x02; break;
24232 case ISD::SETUGE:
24233 case ISD::SETGE: CmpMode = 0x03; break;
24234 case ISD::SETEQ: CmpMode = 0x04; break;
24235 case ISD::SETNE: CmpMode = 0x05; break;
24236 // clang-format on
24237 }
24238
24239 // Are we comparing unsigned or signed integers?
24240 unsigned Opc =
24242
24243 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24244 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24245 }
24246
24247 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24248 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24250 SDValue BC0 = peekThroughBitcasts(Op0);
24251 if (BC0.getOpcode() == ISD::AND &&
24253 /*AllowUndefs=*/false)) {
24254 Cond = ISD::SETEQ;
24255 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24256 }
24257 }
24258
24259 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
24260 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24261 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24263 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24264 unsigned BitWidth = VT.getScalarSizeInBits();
24265 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24266
24267 SDValue Result = Op0.getOperand(0);
24268 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24269 DAG.getConstant(ShiftAmt, dl, VT));
24270 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24271 DAG.getConstant(BitWidth - 1, dl, VT));
24272 return Result;
24273 }
24274 }
24275
24276 // Break 256-bit integer vector compare into smaller ones.
24277 if (VT.is256BitVector() && !Subtarget.hasInt256())
24278 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24279
24280 // Break 512-bit integer vector compare into smaller ones.
24281 // TODO: Try harder to use VPCMPx + VPMOV2x?
24282 if (VT.is512BitVector())
24283 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24284
24285 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24286 // not-of-PCMPEQ:
24287 // X != INT_MIN --> X >s INT_MIN
24288 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24289 // +X != 0 --> +X >s 0
24290 APInt ConstValue;
24291 if (Cond == ISD::SETNE &&
24292 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24293 if (ConstValue.isMinSignedValue())
24294 Cond = ISD::SETGT;
24295 else if (ConstValue.isMaxSignedValue())
24296 Cond = ISD::SETLT;
24297 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24298 Cond = ISD::SETGT;
24299 }
24300
24301 // If both operands are known non-negative, then an unsigned compare is the
24302 // same as a signed compare and there's no need to flip signbits.
24303 // TODO: We could check for more general simplifications here since we're
24304 // computing known bits.
24305 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24306 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24307
24308 // Special case: Use min/max operations for unsigned compares.
24309 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24311 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24312 TLI.isOperationLegal(ISD::UMIN, VT)) {
24313 // If we have a constant operand, increment/decrement it and change the
24314 // condition to avoid an invert.
24315 if (Cond == ISD::SETUGT) {
24316 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24317 if (SDValue UGTOp1 =
24318 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24319 Op1 = UGTOp1;
24320 Cond = ISD::SETUGE;
24321 }
24322 }
24323 if (Cond == ISD::SETULT) {
24324 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24325 if (SDValue ULTOp1 =
24326 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24327 Op1 = ULTOp1;
24328 Cond = ISD::SETULE;
24329 }
24330 }
24331 bool Invert = false;
24332 unsigned Opc;
24333 switch (Cond) {
24334 // clang-format off
24335 default: llvm_unreachable("Unexpected condition code");
24336 case ISD::SETUGT: Invert = true; [[fallthrough]];
24337 case ISD::SETULE: Opc = ISD::UMIN; break;
24338 case ISD::SETULT: Invert = true; [[fallthrough]];
24339 case ISD::SETUGE: Opc = ISD::UMAX; break;
24340 // clang-format on
24341 }
24342
24343 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24344 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24345
24346 // If the logical-not of the result is required, perform that now.
24347 if (Invert)
24348 Result = DAG.getNOT(dl, Result, VT);
24349
24350 return Result;
24351 }
24352
24353 // Try to use SUBUS and PCMPEQ.
24354 if (FlipSigns)
24355 if (SDValue V =
24356 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24357 return V;
24358
24359 // We are handling one of the integer comparisons here. Since SSE only has
24360 // GT and EQ comparisons for integer, swapping operands and multiple
24361 // operations may be required for some comparisons.
24362 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24364 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24366 bool Invert = Cond == ISD::SETNE ||
24368
24369 if (Swap)
24370 std::swap(Op0, Op1);
24371
24372 // Check that the operation in question is available (most are plain SSE2,
24373 // but PCMPGTQ and PCMPEQQ have different requirements).
24374 if (VT == MVT::v2i64) {
24375 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24376 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24377
24378 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24379 // the odd elements over the even elements.
24380 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24381 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24382 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24383
24384 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24385 static const int MaskHi[] = { 1, 1, 3, 3 };
24386 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24387
24388 return DAG.getBitcast(VT, Result);
24389 }
24390
24391 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24392 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24393 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24394
24395 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24396 static const int MaskHi[] = { 1, 1, 3, 3 };
24397 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24398
24399 return DAG.getBitcast(VT, Result);
24400 }
24401
24402 // If the i64 elements are sign-extended enough to be representable as i32
24403 // then we can compare the lower i32 bits and splat.
24404 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24405 DAG.ComputeNumSignBits(Op1) > 32) {
24406 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24407 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24408
24409 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24410 static const int MaskLo[] = {0, 0, 2, 2};
24411 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24412
24413 return DAG.getBitcast(VT, Result);
24414 }
24415
24416 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24417 // bits of the inputs before performing those operations. The lower
24418 // compare is always unsigned.
24419 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24420 : 0x0000000080000000ULL,
24421 dl, MVT::v2i64);
24422
24423 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24424 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24425
24426 // Cast everything to the right type.
24427 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24428 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24429
24430 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24431 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24432 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24433
24434 // Create masks for only the low parts/high parts of the 64 bit integers.
24435 static const int MaskHi[] = { 1, 1, 3, 3 };
24436 static const int MaskLo[] = { 0, 0, 2, 2 };
24437 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24438 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24439 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24440
24441 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24442 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24443
24444 if (Invert)
24445 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24446
24447 return DAG.getBitcast(VT, Result);
24448 }
24449
24450 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24451 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24452 // pcmpeqd + pshufd + pand.
24453 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24454
24455 // First cast everything to the right type.
24456 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24457 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24458
24459 // Do the compare.
24460 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24461
24462 // Make sure the lower and upper halves are both all-ones.
24463 static const int Mask[] = { 1, 0, 3, 2 };
24464 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24465 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24466
24467 if (Invert)
24468 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24469
24470 return DAG.getBitcast(VT, Result);
24471 }
24472 }
24473
24474 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24475 // bits of the inputs before performing those operations.
24476 if (FlipSigns) {
24477 MVT EltVT = VT.getVectorElementType();
24479 VT);
24480 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24481 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24482 }
24483
24484 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24485
24486 // If the logical-not of the result is required, perform that now.
24487 if (Invert)
24488 Result = DAG.getNOT(dl, Result, VT);
24489
24490 return Result;
24491}
24492
24493// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
// Matches an EQ/NE compare of a scalar that is a bitcast of a vXi1 mask
// against 0 or all-ones, and lowers it to a mask-register test whose EFLAGS
// are consumed via the X86 condition code returned through X86CC.
// NOTE(review): the extraction dropped source line 24494, which carried the
// start of this definition (upstream: "static SDValue EmitAVX512Test(SDValue
// Op0, SDValue Op1, ISD::CondCode CC,") -- confirm against upstream.
24495                              const SDLoc &dl, SelectionDAG &DAG,
24496                              const X86Subtarget &Subtarget,
24497                              SDValue &X86CC) {
24498  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24499
24500  // Must be a bitcast from vXi1.
24501  if (Op0.getOpcode() != ISD::BITCAST)
24502    return SDValue();
24503
24504  Op0 = Op0.getOperand(0);
24505  MVT VT = Op0.getSimpleValueType();
  // Only mask widths with a native KORTEST/KTEST for the available features.
24506  if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24507      !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24508      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24509    return SDValue();
24510
24511  X86::CondCode X86Cond;
24512  if (isNullConstant(Op1)) {
    // ZF is set when the tested mask is all zeros, so EQ maps to COND_E.
24513    X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24514  } else if (isAllOnesConstant(Op1)) {
24515    // C flag is set for all ones.
24516    X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24517  } else
24518    return SDValue();
24519
24520  // If the input is an AND, we can combine its operands into the KTEST.
24521  bool KTestable = false;
24522  if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24523    KTestable = true;
24524  if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24525    KTestable = true;
  // KTEST only helps for the compare-with-zero form.
24526  if (!isNullConstant(Op1))
24527    KTestable = false;
24528  if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24529    SDValue LHS = Op0.getOperand(0);
24530    SDValue RHS = Op0.getOperand(1);
24531    X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24532    return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24533  }
24534
24535  // If the input is an OR, we can combine its operands into the KORTEST.
24536  SDValue LHS = Op0;
24537  SDValue RHS = Op0;
24538  if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24539    LHS = Op0.getOperand(0);
24540    RHS = Op0.getOperand(1);
24541  }
24542
24543  X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24544  return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24545}
24546
24547/// Emit flags for the given setcc condition and operands. Also returns the
24548/// corresponding X86 condition code constant in X86CC.
24549SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24550                                             ISD::CondCode CC, const SDLoc &dl,
24551                                             SelectionDAG &DAG,
24552                                             SDValue &X86CC) const {
24553  // Equality Combines.
24554  if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24555    X86::CondCode X86CondCode;
24556
24557    // Optimize to BT if possible.
24558    // Lower (X & (1 << N)) == 0 to BT(X, N).
24559    // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24560    // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24561    if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24562      if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24563        X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24564        return BT;
24565      }
24566    }
24567
24568    // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24569    if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24570                                               X86CondCode)) {
24571      X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24572      return CmpZ;
24573    }
24574
24575    // Try to lower using KORTEST or KTEST.
24576    if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24577      return Test;
24578
24579    // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24580    // of these.
24581    if (isOneConstant(Op1) || isNullConstant(Op1)) {
24582      // If the input is a setcc, then reuse the input setcc or use a new one
24583      // with the inverted condition.
24584      if (Op0.getOpcode() == X86ISD::SETCC) {
24585        bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24586
24587        X86CC = Op0.getOperand(0);
24588        if (Invert) {
24589          X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24590          X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24591          X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24592        }
24593
        // Reuse the flag-producing operand of the existing SETCC node.
24594        return Op0.getOperand(1);
24595      }
24596    }
24597
24598    // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24599    // overflow.
24600    if (isMinSignedConstant(Op1)) {
24601      EVT VT = Op0.getValueType();
24602      if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24603        SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
        // NOTE(review): the extraction dropped source line 24604 here; it
        // selected CondCode for the NEG-overflow test (upstream: COND_O for
        // SETEQ, COND_NO for SETNE) -- confirm against upstream.
24605        X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24606        SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24607                                  DAG.getConstant(0, dl, VT), Op0);
        // Result #1 of the X86ISD::SUB is the EFLAGS value.
24608        return SDValue(Neg.getNode(), 1);
24609      }
24610    }
24611
24612    // Try to use the carry flag from the add in place of a separate CMP for:
24613    // (seteq (add X, -1), -1). Similar for setne.
24614    if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24615        Op0.getOperand(1) == Op1) {
24616      if (isProfitableToUseFlagOp(Op0)) {
24617        SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24618
24619        SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24620                                  Op0.getOperand(1));
        // Redirect users of the original ADD to the flag-producing clone.
24621        DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24622        X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24623        X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24624        return SDValue(New.getNode(), 1);
24625      }
24626    }
24627  }
24628
  // Generic fallback: translate the ISD condition and emit a CMP.
  // NOTE(review): the extraction dropped source line 24629, which declared
  // CondCode (upstream: "X86::CondCode CondCode =") -- confirm.
24630      TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24631  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24632
24633  SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24634  X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24635  return EFLAGS;
24636}
24637
24638SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24639
24640  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24641                  Op.getOpcode() == ISD::STRICT_FSETCCS;
24642  MVT VT = Op->getSimpleValueType(0);
24643
  // Vector compares take a separate lowering path.
24644  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24645
24646  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  // Strict FP setcc nodes carry a chain first, shifting the operand indices.
24647  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24648  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24649  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24650  SDLoc dl(Op);
24651  ISD::CondCode CC =
24652      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24653
24654  if (isSoftF16(Op0.getValueType(), Subtarget))
24655    return SDValue();
24656
24657  // Handle f128 first, since one possible outcome is a normal integer
24658  // comparison which gets handled by emitFlagsForSetcc.
24659  if (Op0.getValueType() == MVT::f128) {
24660    softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24661                        Op.getOpcode() == ISD::STRICT_FSETCCS);
24662
24663    // If softenSetCCOperands returned a scalar, use it.
24664    if (!Op1.getNode()) {
24665      assert(Op0.getValueType() == Op.getValueType() &&
24666             "Unexpected setcc expansion!");
24667      if (IsStrict)
24668        return DAG.getMergeValues({Op0, Chain}, dl);
24669      return Op0;
24670    }
24671  }
24672
24673  if (Op0.getSimpleValueType().isInteger()) {
24674    // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24675    // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24676    // this may translate to less uops depending on uarch implementation. The
24677    // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24678    // canonicalize to that CondCode.
24679    // NOTE: Only do this if incrementing the constant doesn't increase the bit
24680    // encoding size - so it must either already be a i8 or i32 immediate, or it
24681    // shrinks down to that. We don't do this for any i64's to avoid additional
24682    // constant materializations.
24683    // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24684    if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24685      const APInt &Op1Val = Op1C->getAPIntValue();
24686      if (!Op1Val.isZero()) {
24687        // Ensure the constant+1 doesn't overflow.
24688        if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24689            (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24690          APInt Op1ValPlusOne = Op1Val + 1;
24691          if (Op1ValPlusOne.isSignedIntN(32) &&
24692              (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24693            Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
            // NOTE(review): the extraction dropped source lines 24694-24695,
            // which switched CC to the GE form (upstream: CC = SETGT ? SETGE
            // : SETUGE) -- confirm against upstream.
24696          }
24697        }
24698      }
24699    }
24700
24701    SDValue X86CC;
24702    SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24703    SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24704    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24705  }
24706
  // AVX10.2 UCOMX handles OEQ/UNE compares directly.
24707  if (Subtarget.hasAVX10_2()) {
24708    if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24709      auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24710      assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24711      if (Op0.getSimpleValueType() != MVT::f80) {
24712        SDValue Res = getSETCC(
24713            NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24714        return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24715      }
24716    }
24717  }
24718  // Handle floating point.
24719  X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24720  if (CondCode == X86::COND_INVALID)
24721    return SDValue();
24722
24723  SDValue EFLAGS;
24724  if (IsStrict) {
24725    bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24726    EFLAGS =
        // NOTE(review): the extraction dropped source line 24727, which chose
        // the strict compare opcode (upstream: DAG.getNode(IsSignaling ?
        // X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, ...)) -- confirm.
24728        dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24729    Chain = EFLAGS.getValue(1);
24730  } else {
24731    EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24732  }
24733
24734  SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24735  SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24736  return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24737}
24738
24739SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24740 SDValue LHS = Op.getOperand(0);
24741 SDValue RHS = Op.getOperand(1);
24742 SDValue Carry = Op.getOperand(2);
24743 SDValue Cond = Op.getOperand(3);
24744 SDLoc DL(Op);
24745
24746 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24747 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24748
24749 // Recreate the carry if needed.
24750 EVT CarryVT = Carry.getValueType();
24751 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24752 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24753
24754 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24755 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24756 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24757}
24758
24759// This function returns three things: the arithmetic computation itself
24760// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24761// flag and the condition code define the case in which the arithmetic
24762// computation overflows.
24763static std::pair<SDValue, SDValue>
// NOTE(review): the extraction dropped source line 24764, which carried the
// rest of this signature (upstream: "getX86XALUOOp(X86::CondCode &Cond,
// SDValue Op, SelectionDAG &DAG) {") -- confirm against upstream.
24765  assert(Op.getResNo() == 0 && "Unexpected result number!");
24766  SDValue Value, Overflow;
24767  SDValue LHS = Op.getOperand(0);
24768  SDValue RHS = Op.getOperand(1);
24769  unsigned BaseOp = 0;
24770  SDLoc DL(Op);
  // Map each ISD overflow op to its flag-setting X86 node and the condition
  // that signals overflow for it.
24771  switch (Op.getOpcode()) {
24772  default: llvm_unreachable("Unknown ovf instruction!");
24773  case ISD::SADDO:
24774    BaseOp = X86ISD::ADD;
24775    Cond = X86::COND_O;
24776    break;
24777  case ISD::UADDO:
24778    BaseOp = X86ISD::ADD;
    // NOTE(review): the extraction dropped source line 24779, which set the
    // unsigned-add overflow condition (upstream: Cond = X86::COND_B;).
24780    break;
24781  case ISD::SSUBO:
24782    BaseOp = X86ISD::SUB;
24783    Cond = X86::COND_O;
24784    break;
24785  case ISD::USUBO:
24786    BaseOp = X86ISD::SUB;
24787    Cond = X86::COND_B;
24788    break;
24789  case ISD::SMULO:
24790    BaseOp = X86ISD::SMUL;
24791    Cond = X86::COND_O;
24792    break;
24793  case ISD::UMULO:
24794    BaseOp = X86ISD::UMUL;
24795    Cond = X86::COND_O;
24796    break;
24797  }
24798
24799  if (BaseOp) {
24800    // Also sets EFLAGS.
24801    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24802    Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24803    Overflow = Value.getValue(1);
24804  }
24805
24806  return std::make_pair(Value, Overflow);
24807}
24808
// NOTE(review): the extraction dropped source line 24809, which carried this
// definition's signature (upstream: "static SDValue LowerXALUO(SDValue Op,
// SelectionDAG &DAG) {") -- confirm against upstream.
24810  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24811  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24812  // looks for this combo and may remove the "setcc" instruction if the "setcc"
24813  // has only one use.
24814  SDLoc DL(Op);
  // NOTE(review): the extraction dropped source line 24815, which declared
  // Cond (upstream: "X86::CondCode Cond;") -- confirm.
24816  SDValue Value, Overflow;
24817  std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24818
24819  SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
  // The node yields two results: the arithmetic value and an i8 overflow bit.
24820  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24821  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24822}
24823
24824/// Return true if opcode is a X86 logical comparison.
// NOTE(review): the extraction dropped source line 24825, which carried this
// definition's signature (upstream: "static bool isX86LogicalCmp(SDValue Op)
// {") -- confirm against upstream.
24826  unsigned Opc = Op.getOpcode();
  // Dedicated compare nodes always produce usable EFLAGS.
24827  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24828      Opc == X86ISD::FCMP)
24829    return true;
  // Arithmetic/logic nodes expose EFLAGS only as their second result (#1).
24830  if (Op.getResNo() == 1 &&
24831      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
       // NOTE(review): the extraction dropped source line 24832 here, which
       // listed more flag-producing opcodes (upstream includes X86ISD::SBB,
       // X86ISD::SMUL, X86ISD::UMUL) -- confirm.
24833       Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24834    return true;
24835
24836  return false;
24837}
24838
// Return true if V is a truncation whose discarded high bits are known to be
// zero, i.e. the truncate loses no information.
// NOTE(review): the extraction dropped source line 24839, which carried this
// definition's signature (upstream: "static bool
// isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {") -- confirm
// against upstream.
24840  if (V.getOpcode() != ISD::TRUNCATE)
24841    return false;
24842
24843  SDValue VOp0 = V.getOperand(0);
24844  unsigned InBits = VOp0.getValueSizeInBits();
24845  unsigned Bits = V.getValueSizeInBits();
  // Lossless iff every bit above the result width is known zero.
24846  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24847}
24848
24849// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
// NOTE(review): the extraction dropped source line 24850, which carried the
// start of this definition (upstream: "static SDValue
// LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,") --
// confirm against upstream.
24851                                       unsigned X86CC, const SDLoc &DL,
24852                                       SelectionDAG &DAG,
24853                                       const X86Subtarget &Subtarget) {
24854  EVT CmpVT = CmpVal.getValueType();
24855  EVT VT = LHS.getValueType();
  // Only scalar-integer selects are handled here.
24856  if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24857    return SDValue();
24858
24859  if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24860      isOneConstant(CmpVal.getOperand(1))) {
    // Builds -(CmpVal & 1) at width SplatVT: an all-zeros/all-ones mask
    // derived from the tested low bit.
24861    auto SplatLSB = [&](EVT SplatVT) {
24862      // we need mask of all zeros or ones with same size of the other
24863      // operands.
24864      SDValue Neg = CmpVal;
24865      if (CmpVT.bitsGT(SplatVT))
24866        Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24867      else if (CmpVT.bitsLT(SplatVT))
24868        Neg = DAG.getNode(
24869            ISD::AND, DL, SplatVT,
24870            DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24871            DAG.getConstant(1, DL, SplatVT));
24872      return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24873    };
24874
24875    // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
    // NOTE(review): the extraction dropped source line 24876, the guard for
    // this transform (upstream: "if (isNullConstant(LHS) &&
    // isAllOnesConstant(RHS))") -- confirm against upstream.
24877      return SplatLSB(VT);
24878
24879    // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24880    if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24881        isa<ConstantSDNode>(RHS)) {
24882      SDValue Mask = SplatLSB(VT);
24883      SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24884      SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24885      return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24886    }
24887
24888    SDValue Src1, Src2;
    // Matches RHS = (op LHS, Z) where op's identity element is zero,
    // capturing Z in Src1 and LHS in Src2.
24889    auto isIdentityPatternZero = [&]() {
24890      switch (RHS.getOpcode()) {
24891      default:
24892        break;
24893      case ISD::OR:
24894      case ISD::XOR:
24895      case ISD::ADD:
        // Commutative ops: LHS may appear on either side.
24896        if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24897          Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24898          Src2 = LHS;
24899          return true;
24900        }
24901        break;
24902      case ISD::SHL:
24903      case ISD::SRA:
24904      case ISD::SRL:
24905      case ISD::SUB:
        // Non-commutative ops: LHS must be the first operand.
24906        if (RHS.getOperand(0) == LHS) {
24907          Src1 = RHS.getOperand(1);
24908          Src2 = LHS;
24909          return true;
24910        }
24911        break;
24912      }
24913      return false;
24914    };
24915
    // Matches LHS = (and RHS, Z) -- the all-ones-identity counterpart.
24916    auto isIdentityPatternOnes = [&]() {
24917      switch (LHS.getOpcode()) {
24918      default:
24919        break;
24920      case ISD::AND:
24921        if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24922          Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24923          Src2 = RHS;
24924          return true;
24925        }
24926        break;
24927      }
24928      return false;
24929    };
24930
24931    // Convert 'identity' patterns (iff X is 0 or 1):
24932    // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24933    // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24934    // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24935    // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24936    // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24937    // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24938    // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24939    if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24940      SDValue Mask = SplatLSB(Src1.getValueType());
24941      SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24942                                Src1); // Mask & z
24943      return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24944    }
24945    // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24946    if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24947      SDValue Mask = SplatLSB(VT);
24948      SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24949      return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24950    }
24951  }
24952
24953  if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
      // NOTE(review): the extraction dropped source lines 24954-24955 here;
      // upstream they require one select arm to be all-ones and bind the
      // other arm to 'Y' (used below) -- confirm against upstream.
24956    SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24957
24958    // 'X - 1' sets the carry flag if X == 0.
24959    // '0 - X' sets the carry flag if X != 0.
24960    // Convert the carry flag to a -1/0 mask with sbb:
24961    // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24962    // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24963    // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24964    // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24965    SDValue Sub;
24966    if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24967      SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24968      Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24969    } else {
24970      SDValue One = DAG.getConstant(1, DL, CmpVT);
24971      Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24972    }
    // SETCC_CARRY broadcasts CF across VT, producing the -1/0 mask.
24973    SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24974                              DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24975                              Sub.getValue(1));
24976    return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24977  }
24978
24979  return SDValue();
24980}
24981
24982SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24983 bool AddTest = true;
24984 SDValue Cond = Op.getOperand(0);
24985 SDValue Op1 = Op.getOperand(1);
24986 SDValue Op2 = Op.getOperand(2);
24987 SDLoc DL(Op);
24988 MVT VT = Op1.getSimpleValueType();
24989 SDValue CC;
24990
24991 if (isSoftF16(VT, Subtarget)) {
24992 MVT NVT = VT.changeTypeToInteger();
24993 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24994 DAG.getBitcast(NVT, Op1),
24995 DAG.getBitcast(NVT, Op2)));
24996 }
24997
24998 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24999 // are available or VBLENDV if AVX is available.
25000 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25001 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25002 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25003 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25004 bool IsAlwaysSignaling;
25005 unsigned SSECC =
25006 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25007 CondOp0, CondOp1, IsAlwaysSignaling);
25008
25009 if (Subtarget.hasAVX512()) {
25010 SDValue Cmp =
25011 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25012 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25013 assert(!VT.isVector() && "Not a scalar type?");
25014 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25015 }
25016
25017 if (SSECC < 8 || Subtarget.hasAVX()) {
25018 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25019 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25020
25021 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25022 // instead of 3 logic instructions for size savings and potentially speed.
25023 // Unfortunately, there is no scalar form of VBLENDV.
25024 //
25025 // If either operand is a +0.0 constant, don't try this. We can expect to
25026 // optimize away at least one of the logic instructions later in that
25027 // case, so that sequence would be faster than a variable blend.
25028 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25029 !isNullFPConstant(Op2)) {
25030 // Convert to vectors, do a VSELECT, and convert back to scalar.
25031 // All of the conversions should be optimized away.
25032 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25033 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25034 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25035 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25036
25037 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25038 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25039
25040 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25041
25042 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25043 DAG.getVectorIdxConstant(0, DL));
25044 }
25045 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25046 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25047 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25048 }
25049 }
25050
25051 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25052 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25053 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25054 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25055 }
25056
25057 if (Cond.getOpcode() == ISD::SETCC &&
25058 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25059 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25060 Cond = NewCond;
25061 // If the condition was updated, it's possible that the operands of the
25062 // select were also updated (for example, EmitTest has a RAUW). Refresh
25063 // the local references to the select operands in case they got stale.
25064 Op1 = Op.getOperand(1);
25065 Op2 = Op.getOperand(2);
25066 }
25067 }
25068
25069 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25070 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25071 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25072 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25073 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25074 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25075 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25076 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25077 if (Cond.getOpcode() == X86ISD::SETCC &&
25078 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25079 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25080 SDValue Cmp = Cond.getOperand(1);
25081 SDValue CmpOp0 = Cmp.getOperand(0);
25082 unsigned CondCode = Cond.getConstantOperandVal(0);
25083
25084 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25085 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25086 // handle to keep the CMP with 0. This should be removed by
25087 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25088 // cttz_zero_undef.
25089 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25090 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25091 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25092 };
25093 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25094 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25095 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25096 // Keep Cmp.
25097 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25098 DL, DAG, Subtarget)) {
25099 return R;
25100 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
25101 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25102 ((CondCode == X86::COND_S) || // smin(x, 0)
25103 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25104 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25105 //
25106 // If the comparison is testing for a positive value, we have to invert
25107 // the sign bit mask, so only do that transform if the target has a
25108 // bitwise 'and not' instruction (the invert is free).
25109 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25110 unsigned ShCt = VT.getSizeInBits() - 1;
25111 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25112 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25113 if (CondCode == X86::COND_G)
25114 Shift = DAG.getNOT(DL, Shift, VT);
25115 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25116 }
25117 }
25118
25119 // Look past (and (setcc_carry (cmp ...)), 1).
25120 if (Cond.getOpcode() == ISD::AND &&
25121 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25122 isOneConstant(Cond.getOperand(1)))
25123 Cond = Cond.getOperand(0);
25124
25125 // Attempt to fold "raw cond" cases by treating them as:
25126 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25127 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25128 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25129 Subtarget))
25130 return R;
25131
25132 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25133 // setting operand in place of the X86ISD::SETCC.
25134 unsigned CondOpcode = Cond.getOpcode();
25135 if (CondOpcode == X86ISD::SETCC ||
25136 CondOpcode == X86ISD::SETCC_CARRY) {
25137 CC = Cond.getOperand(0);
25138
25139 SDValue Cmp = Cond.getOperand(1);
25140 bool IllegalFPCMov = false;
25141 if (VT.isFloatingPoint() && !VT.isVector() &&
25142 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25143 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25144
25145 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25146 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25147 Cond = Cmp;
25148 AddTest = false;
25149 }
25150 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25151 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25152 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25153 SDValue Value;
25154 X86::CondCode X86Cond;
25155 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25156
25157 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25158 AddTest = false;
25159 }
25160
25161 if (AddTest) {
25162 // Look past the truncate if the high bits are known zero.
25164 Cond = Cond.getOperand(0);
25165
25166 // We know the result of AND is compared against zero. Try to match
25167 // it to BT.
25168 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25169 X86::CondCode X86CondCode;
25170 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25171 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25172 Cond = BT;
25173 AddTest = false;
25174 }
25175 }
25176 }
25177
25178 if (AddTest) {
25179 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25180 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25181 }
25182
25183 // a < b ? -1 : 0 -> RES = ~setcc_carry
25184 // a < b ? 0 : -1 -> RES = setcc_carry
25185 // a >= b ? -1 : 0 -> RES = setcc_carry
25186 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25187 if (Cond.getOpcode() == X86ISD::SUB) {
25188 unsigned CondCode = CC->getAsZExtVal();
25189
25190 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25191 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25192 (isNullConstant(Op1) || isNullConstant(Op2))) {
25193 SDValue Res =
25194 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25195 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25196 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25197 return DAG.getNOT(DL, Res, Res.getValueType());
25198 return Res;
25199 }
25200 }
25201
25202 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25203 // widen the cmov and push the truncate through. This avoids introducing a new
25204 // branch during isel and doesn't add any extensions.
25205 if (Op.getValueType() == MVT::i8 &&
25206 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25207 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25208 if (T1.getValueType() == T2.getValueType() &&
25209 // Exclude CopyFromReg to avoid partial register stalls.
25210 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25211 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25212 CC, Cond);
25213 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25214 }
25215 }
25216
25217 // Or finally, promote i8 cmovs if we have CMOV,
25218 // or i16 cmovs if it won't prevent folding a load.
25219 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25220 // legal, but EmitLoweredSelect() can not deal with these extensions
25221 // being inserted between two CMOV's. (in i16 case too TBN)
25222 // https://bugs.llvm.org/show_bug.cgi?id=40974
25223 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25224 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25225 !X86::mayFoldLoad(Op2, Subtarget))) {
25226 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25227 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25228 SDValue Ops[] = { Op2, Op1, CC, Cond };
25229 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25230 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25231 }
25232
25233 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25234 // condition is true.
25235 SDValue Ops[] = { Op2, Op1, CC, Cond };
25236 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25237}
25238
25240 const X86Subtarget &Subtarget,
25241 SelectionDAG &DAG) {
  // Lower an extend whose source is a vXi1 mask vector (AVX-512 k-register).
  // Strategy: pick a legal wide element type, widen to 512 bits when VLX is
  // unavailable, materialize the extension (natively or via a select of
  // all-ones/zero), then truncate/extract back down to the requested VT.
25242 MVT VT = Op->getSimpleValueType(0);
25243 SDValue In = Op->getOperand(0);
25244 MVT InVT = In.getSimpleValueType();
25245 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25246 MVT VTElt = VT.getVectorElementType();
25247 unsigned NumElts = VT.getVectorNumElements();
25248
25249 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25250 MVT ExtVT = VT;
25251 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25252 // If v16i32 is to be avoided, we'll need to split and concatenate.
25253 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25254 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25255
25256 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25257 }
25258
25259 // Widen to 512-bits if VLX is not supported.
25260 MVT WideVT = ExtVT;
25261 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25262 NumElts *= 512 / ExtVT.getSizeInBits();
25263 InVT = MVT::getVectorVT(MVT::i1, NumElts);
  // Place the original mask in the low lanes of a wider undef mask.
25264 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25265 DAG.getVectorIdxConstant(0, dl));
25266 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25267 }
25268
25269 SDValue V;
25270 MVT WideEltVT = WideVT.getVectorElementType();
  // With DQI (32/64-bit elements) or BWI (8/16-bit elements) the extend from
  // a mask is directly selectable; otherwise emit all-ones/zero via VSELECT.
25271 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25272 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25273 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25274 } else {
25275 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25276 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25277 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25278 }
25279
25280 // Truncate if we had to extend i16/i8 above.
25281 if (VT != ExtVT) {
25282 WideVT = MVT::getVectorVT(VTElt, NumElts);
25283 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25284 }
25285
25286 // Extract back to 128/256-bit if we widened.
25287 if (WideVT != VT)
25288 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25289 DAG.getVectorIdxConstant(0, dl));
25290
25291 return V;
25292}
25293
25295 SelectionDAG &DAG) {
  // Extend-lowering entry point (the signature's opening line is above this
  // view). vXi1 mask sources take the dedicated AVX-512 mask path; all other
  // vectors go through the generic AVX extend helper.
25296 SDValue In = Op->getOperand(0);
25297 MVT InVT = In.getSimpleValueType();
25298 SDLoc DL(Op);
25299
  // Mask (vXi1) inputs need k-register-aware lowering.
25300 if (InVT.getVectorElementType() == MVT::i1)
25301 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25302
  // Non-mask inputs only reach this custom lowering on AVX targets.
25303 assert(Subtarget.hasAVX() && "Expected AVX support");
25304 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25305}
25306
25307// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25308// For sign extend this needs to handle all vector sizes and SSE4.1 and
25309// non-SSE4.1 targets. For zero extend this should only handle inputs of
25310// MVT::v64i8 when BWI is not supported, but AVX512 is.
25312 const X86Subtarget &Subtarget,
25313 SelectionDAG &DAG) {
  // Lower SIGN/ZERO_EXTEND_VECTOR_INREG (see the header comment above this
  // function). Returns SDValue() for element-type / subtarget combinations
  // this routine does not handle, deferring to generic legalization.
25314 SDValue In = Op->getOperand(0);
25315 MVT VT = Op->getSimpleValueType(0);
25316 MVT InVT = In.getSimpleValueType();
25317
25318 MVT SVT = VT.getVectorElementType();
25319 MVT InSVT = InVT.getVectorElementType();
25321
  // Only handle i8/i16/i32 -> i16/i32/i64 widening on SSE2/AVX/AVX512-legal
  // vector widths; bail out for anything else.
25322 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25323 return SDValue();
25324 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25325 return SDValue();
25326 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25327 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25328 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25329 return SDValue();
25330
25331 SDLoc dl(Op);
25332 unsigned Opc = Op.getOpcode();
25333 unsigned NumElts = VT.getVectorNumElements();
25334
25335 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25336 // For 512-bit vectors, we need 128-bits or 256-bits.
25337 if (InVT.getSizeInBits() > 128) {
25338 // Input needs to be at least the same number of elements as output, and
25339 // at least 128-bits.
25340 int InSize = InSVT.getSizeInBits() * NumElts;
25341 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25342 InVT = In.getSimpleValueType();
25343 }
25344
25345 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25346 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25347 // need to be handled here for 256/512-bit results.
25348 if (Subtarget.hasInt256()) {
25349 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25350
  // If the (trimmed) input still has more elements than the result, the
  // *_VECTOR_INREG form is the right node to emit directly.
25351 if (InVT.getVectorNumElements() != NumElts)
25352 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25353
25354 // FIXME: Apparently we create inreg operations that could be regular
25355 // extends.
  // NOTE(review): the initializer of ExtOpc is not visible in this view;
  // presumably it maps the *_VECTOR_INREG opcode to the plain extend opcode.
25356 unsigned ExtOpc =
25359 return DAG.getNode(ExtOpc, dl, VT, In);
25360 }
25361
25362 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25363 if (Subtarget.hasAVX()) {
25364 assert(VT.is256BitVector() && "256-bit vector expected");
25365 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25366 int HalfNumElts = HalfVT.getVectorNumElements();
25367
  // Build a mask that moves the upper source elements into the low lanes so
  // each half can be extended independently.
25368 unsigned NumSrcElts = InVT.getVectorNumElements();
25369 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25370 for (int i = 0; i != HalfNumElts; ++i)
25371 HiMask[i] = HalfNumElts + i;
25372
25373 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25374 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25375 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25376 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25377 }
25378
25379 // We should only get here for sign extend.
25380 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25381 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25382 unsigned InNumElts = InVT.getVectorNumElements();
25383
25384 // If the source elements are already all-signbits, we don't need to extend,
25385 // just splat the elements.
25386 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25387 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25388 unsigned Scale = InNumElts / NumElts;
25389 SmallVector<int, 16> ShuffleMask;
25390 for (unsigned I = 0; I != NumElts; ++I)
25391 ShuffleMask.append(Scale, I);
25392 return DAG.getBitcast(VT,
25393 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25394 }
25395
25396 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25397 SDValue Curr = In;
25398 SDValue SignExt = Curr;
25399
25400 // As SRAI is only available on i16/i32 types, we expand only up to i32
25401 // and handle i64 separately.
25402 if (InVT != MVT::v4i32) {
25403 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25404
25405 unsigned DestWidth = DestVT.getScalarSizeInBits();
25406 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25407 unsigned DestElts = DestVT.getVectorNumElements();
25408
25409 // Build a shuffle mask that takes each input element and places it in the
25410 // MSBs of the new element size.
25411 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25412 for (unsigned i = 0; i != DestElts; ++i)
25413 Mask[i * Scale + (Scale - 1)] = i;
25414
25415 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25416 Curr = DAG.getBitcast(DestVT, Curr);
25417
  // Arithmetic-shift the value out of the MSBs to complete the sign extend.
25418 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25419 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25420 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25421 }
25422
25423 if (VT == MVT::v2i64) {
25424 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
  // Compute the sign word as (0 > Curr), then interleave the low/high value
  // and sign words to form the two i64 lanes.
25425 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25426 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25427 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25428 SignExt = DAG.getBitcast(VT, SignExt);
25429 }
25430
25431 return SignExt;
25432}
25433
25435 SelectionDAG &DAG) {
  // Sign-extend lowering (the signature's opening line is above this view).
  // Mask inputs are dispatched to the AVX-512 mask helper; pre-AVX2 256-bit
  // extends are split into two 128-bit SIGN_EXTEND_VECTOR_INREG halves.
25436 MVT VT = Op->getSimpleValueType(0);
25437 SDValue In = Op->getOperand(0);
25438 MVT InVT = In.getSimpleValueType();
25439 SDLoc dl(Op);
25440
25441 if (InVT.getVectorElementType() == MVT::i1)
25442 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25443
25444 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25446 "Expected same number of elements");
25447 assert((VT.getVectorElementType() == MVT::i16 ||
25448 VT.getVectorElementType() == MVT::i32 ||
25449 VT.getVectorElementType() == MVT::i64) &&
25450 "Unexpected element type");
25451 assert((InVT.getVectorElementType() == MVT::i8 ||
25452 InVT.getVectorElementType() == MVT::i16 ||
25453 InVT.getVectorElementType() == MVT::i32) &&
25454 "Unexpected element type");
25455
  // Without BWI there is no v32i16 arithmetic; split into two 256-bit ops.
25456 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25457 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25458 return splitVectorIntUnary(Op, DAG, dl);
25459 }
25460
  // AVX2+ handles the remaining cases natively.
25461 if (Subtarget.hasInt256())
25462 return Op;
25463
25464 // Optimize vectors in AVX mode
25465 // Sign extend v8i16 to v8i32 and
25466 // v4i32 to v4i64
25467 //
25468 // Divide input vector into two parts
25469 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25470 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25471 // concat the vectors to original VT
25472 MVT HalfVT = VT.getHalfNumVectorElementsVT()
25473 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25474
  // Shuffle the upper source elements down into the low lanes, then extend.
25475 unsigned NumElems = InVT.getVectorNumElements();
25476 SmallVector<int,8> ShufMask(NumElems, -1);
25477 for (unsigned i = 0; i != NumElems/2; ++i)
25478 ShufMask[i] = i + NumElems/2;
25479
25480 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25481 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25482
25483 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25484}
25485
25486/// Change a vector store into a pair of half-size vector stores.
25488 SDValue StoredVal = Store->getValue();
  // (Body of splitVectorStore; the signature line is above this view.)
  // Splits one 256/512-bit store into two half-width stores at Ptr and
  // Ptr+HalfOffset, joined by a TokenFactor. Returns SDValue() when the
  // store cannot legally be split (volatile/atomic).
25489 assert((StoredVal.getValueType().is256BitVector() ||
25490 StoredVal.getValueType().is512BitVector()) &&
25491 "Expecting 256/512-bit op");
25492
25493 // Splitting volatile memory ops is not allowed unless the operation was not
25494 // legal to begin with. Assume the input store is legal (this transform is
25495 // only used for targets with AVX). Note: It is possible that we have an
25496 // illegal type like v2i128, and so we could allow splitting a volatile store
25497 // in that case if that is important.
25498 if (!Store->isSimple())
25499 return SDValue();
25500
25501 SDLoc DL(Store);
25502 SDValue Value0, Value1;
25503 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
  // Second half lives HalfOffset bytes past the original base pointer.
25504 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25505 SDValue Ptr0 = Store->getBasePtr();
25506 SDValue Ptr1 =
25507 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25508 SDValue Ch0 =
25509 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25510 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25511 SDValue Ch1 =
25512 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25513 Store->getPointerInfo().getWithOffset(HalfOffset),
25514 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
  // Tie both halves together so chain users see a single store.
25515 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25516}
25517
25518/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25519/// type.
25521 SelectionDAG &DAG) {
  // (Body of scalarizeVectorStore; signature opens above this view.)
  // Bitcasts the 128-bit value to StoreVT and emits one scalar store per
  // element, all joined by a TokenFactor. Returns SDValue() for
  // volatile/atomic stores, which must not be split.
25522 SDValue StoredVal = Store->getValue();
25523 assert(StoreVT.is128BitVector() &&
25524 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25525 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25526
25527 // Splitting volatile memory ops is not allowed unless the operation was not
25528 // legal to begin with. We are assuming the input op is legal (this transform
25529 // is only used for targets with AVX).
25530 if (!Store->isSimple())
25531 return SDValue();
25532
25533 MVT StoreSVT = StoreVT.getScalarType();
25534 unsigned NumElems = StoreVT.getVectorNumElements();
25535 unsigned ScalarSize = StoreSVT.getStoreSize();
25536
25537 SDLoc DL(Store);
  // Extract each lane and store it at its byte offset from the base pointer.
25539 for (unsigned i = 0; i != NumElems; ++i) {
25540 unsigned Offset = i * ScalarSize;
25541 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25543 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25544 DAG.getVectorIdxConstant(i, DL));
25545 SDValue Ch =
25546 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25547 Store->getPointerInfo().getWithOffset(Offset),
25548 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25549 Stores.push_back(Ch);
25550 }
25551 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25552}
25553
// Custom lowering for vector stores: vXi1 mask stores (AVX512F without DQI),
// splitting of 256/512-bit concat stores, and widening of 64-bit element
// stores to a 128-bit extract+store. Returns SDValue() to use default
// legalization.
25554 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25555 SelectionDAG &DAG) {
25556 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25557 SDLoc dl(St);
25558 SDValue StoredVal = St->getValue();
25559
25560 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25561 if (StoredVal.getValueType().isVector() &&
25562 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25563 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25564 assert(NumElts <= 8 && "Unexpected VT");
25565 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25566 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25567 "Expected AVX512F without AVX512DQI");
25568
25569 // We must pad with zeros to ensure we store zeroes to any unused bits.
25570 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25571 DAG.getUNDEF(MVT::v16i1), StoredVal,
25572 DAG.getVectorIdxConstant(0, dl));
  // Reinterpret the 16-bit mask as an integer and store the low byte.
25573 StoredVal = DAG.getBitcast(MVT::i16, StoredVal)
25574 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25575 // Make sure we store zeros in the extra bits.
25576 if (NumElts < 8)
25577 StoredVal = DAG.getZeroExtendInReg(
25578 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25579
25580 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25581 St->getPointerInfo(), St->getBaseAlign(),
25582 St->getMemOperand()->getFlags());
25583 }
25584
25585 if (St->isTruncatingStore())
25586 return SDValue();
25587
25588 // If this is a 256/512-bit store of concatenated ops, we are better off
25589 // splitting that store into two half-size stores. This avoids spurious use of
25590 // concatenated ops and each half can execute independently. Some cores would
25591 // split the op into halves anyway, so the concat is purely an extra op.
25592 MVT StoreVT = StoredVal.getSimpleValueType();
25593 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25594 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25595 return splitVectorStore(St, DAG);
25596 return SDValue();
25597 }
25598
25599 if (StoreVT.is32BitVector())
25600 return SDValue();
25601
25602 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25603 assert(StoreVT.is64BitVector() && "Unexpected VT");
25604 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25606 "Unexpected type action!");
25607
  // Widen the 64-bit vector to its 128-bit transform type, padding with undef.
25608 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25609 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25610 DAG.getUNDEF(StoreVT));
25611
25612 if (Subtarget.hasSSE2()) {
25613 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25614 // and store it.
25615 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25616 MVT CastVT = MVT::getVectorVT(StVT, 2);
25617 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25618 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25619 DAG.getVectorIdxConstant(0, dl));
25620
25621 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25622 St->getPointerInfo(), St->getBaseAlign(),
25623 St->getMemOperand()->getFlags());
25624 }
  // SSE1-only fallback: use the MOVLPS-style extract-and-store node.
25625 assert(Subtarget.hasSSE1() && "Expected SSE");
25626 SDVTList Tys = DAG.getVTList(MVT::Other);
25627 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25628 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25629 St->getMemOperand());
25630}
25631
25632// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25633// may emit an illegal shuffle but the expansion is still better than scalar
25634// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25635 // we'll emit a shuffle and an arithmetic shift.
25636// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25637// TODO: It is possible to support ZExt by zeroing the undef values during
25638// the shuffle phase or after the shuffle.
25639static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25640 SelectionDAG &DAG) {
25641 MVT RegVT = Op.getSimpleValueType();
25642 assert(RegVT.isVector() && "We only custom lower vector loads.");
25643 assert(RegVT.isInteger() &&
25644 "We only custom lower integer vector loads.");
25645
25646 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25647 SDLoc dl(Ld);
25648
25649 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25650 if (RegVT.getVectorElementType() == MVT::i1) {
25651 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25652 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25653 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25654 "Expected AVX512F without AVX512DQI");
25655
25656 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25657 Ld->getPointerInfo(), Ld->getBaseAlign(),
25658 Ld->getMemOperand()->getFlags());
25659
25660 // Replace chain users with the new chain.
25661 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25662
25663 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25664 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25665 DAG.getBitcast(MVT::v16i1, Val),
25666 DAG.getVectorIdxConstant(0, dl));
25667 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25668 }
25669
25670 return SDValue();
25671}
25672
25673/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25674/// each of which has no other use apart from the AND / OR.
25675static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25676 Opc = Op.getOpcode();
25677 if (Opc != ISD::OR && Opc != ISD::AND)
25678 return false;
25679 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25680 Op.getOperand(0).hasOneUse() &&
25681 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25682 Op.getOperand(1).hasOneUse());
25683}
25684
/// Lower ISD::BRCOND to X86ISD::BRCOND, translating the condition into an
/// EFLAGS-producing node plus an x86 condition code. Handles overflow
/// intrinsics, integer and FP compares (including the two-branch OEQ/UNE
/// expansions), and finally falls back to testing bit 0 of the condition.
25685 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25686 SDValue Chain = Op.getOperand(0);
25687 SDValue Cond = Op.getOperand(1);
25688 SDValue Dest = Op.getOperand(2);
25689 SDLoc dl(Op);
25690
25691 // Bail out when we don't have native compare instructions.
25692 if (Cond.getOpcode() == ISD::SETCC &&
25693 Cond.getOperand(0).getValueType() != MVT::f128 &&
25694 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25695 SDValue LHS = Cond.getOperand(0);
25696 SDValue RHS = Cond.getOperand(1);
25697 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25698
25699 // Special case for
25700 // setcc([su]{add,sub,mul}o == 0)
25701 // setcc([su]{add,sub,mul}o != 1)
25702 if (ISD::isOverflowIntrOpRes(LHS) &&
25703 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25704 (isNullConstant(RHS) || isOneConstant(RHS))) {
25705 SDValue Value, Overflow;
25706 X86::CondCode X86Cond;
25707 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25708
  // Branch on "no overflow" when testing for equality with 0 (and vice
  // versa), so flip the condition in those cases.
25709 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25710 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25711
25712 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25713 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25714 Overflow, Op->getFlags());
25715 }
25716
  // Integer compares: emit the flag-producing node directly.
25717 if (LHS.getSimpleValueType().isInteger()) {
25718 SDValue CCVal;
25719 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25720 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25721 EFLAGS, Op->getFlags());
25722 }
25723
25724 if (CC == ISD::SETOEQ) {
25725 // For FCMP_OEQ, we can emit
25726 // two branches instead of an explicit AND instruction with a
25727 // separate test. However, we only do this if this block doesn't
25728 // have a fall-through edge, because this requires an explicit
25729 // jmp when the condition is false.
25730 if (Op.getNode()->hasOneUse()) {
25731 SDNode *User = *Op.getNode()->user_begin();
25732 // Look for an unconditional branch following this conditional branch.
25733 // We need this because we need to reverse the successors in order
25734 // to implement FCMP_OEQ.
25735 if (User->getOpcode() == ISD::BR) {
25736 SDValue FalseBB = User->getOperand(1);
25737 SDNode *NewBR =
25738 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25739 assert(NewBR == User);
25740 (void)NewBR;
25741 Dest = FalseBB;
25742
  // OEQ == !NE && !P: branch away on NE or P, fall through on equal.
25743 SDValue Cmp =
25744 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25745 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25746 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25747 CCVal, Cmp, Op->getFlags());
25748 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25749 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25750 Cmp, Op->getFlags());
25751 }
25752 }
25753 } else if (CC == ISD::SETUNE) {
25754 // For FCMP_UNE, we can emit
25755 // two branches instead of an explicit OR instruction with a
25756 // separate test.
25757 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25758 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25759 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25760 Cmp, Op->getFlags());
25761 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25762 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25763 Cmp, Op->getFlags());
25764 } else {
  // Generic FP compare: translate to a single x86 condition code.
25765 X86::CondCode X86Cond =
25766 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25767 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25768 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25769 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25770 Cmp, Op->getFlags());
25771 }
25772 }
25773
  // NOTE(review): the guarding condition for this branch (line above) is not
  // visible in this view; the body branches on an overflow-intrinsic result.
25775 SDValue Value, Overflow;
25776 X86::CondCode X86Cond;
25777 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25778
25779 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25780 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25781 Overflow, Op->getFlags());
25782 }
25783
25784 // Look past the truncate if the high bits are known zero.
25786 Cond = Cond.getOperand(0);
25787
25788 EVT CondVT = Cond.getValueType();
25789
25790 // Add an AND with 1 if we don't already have one.
25791 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25792 Cond =
25793 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25794
  // Fallback: branch on (Cond & 1) != 0.
25795 SDValue LHS = Cond;
25796 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25797
25798 SDValue CCVal;
25799 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25800 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25801 Op->getFlags());
25802}
25803
25804// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25805// Calls to _alloca are needed to probe the stack when allocating more than 4k
25806// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25807// that the guard pages used by the OS virtual memory manager are allocated in
25808// correct sequence.
25809 SDValue
25810 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25811 SelectionDAG &DAG) const {
  // Three strategies, chosen below:
  //  1. !Lower: plain SP adjustment (optionally via PROBED_ALLOCA).
  //  2. SplitStack: SEG_ALLOCA pseudo for segmented stacks.
  //  3. Otherwise: DYN_ALLOCA pseudo, which expands to the probing call.
25813 bool SplitStack = MF.shouldSplitStack();
25814 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25815 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25816 SplitStack || EmitStackProbeCall;
25817 SDLoc dl(Op);
25818
25819 // Get the inputs.
25820 SDNode *Node = Op.getNode();
25821 SDValue Chain = Op.getOperand(0);
25822 SDValue Size = Op.getOperand(1);
25823 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25824 EVT VT = Node->getValueType(0);
25825
25826 // Chain the dynamic stack allocation so that it doesn't modify the stack
25827 // pointer when other instructions are using the stack.
25828 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25829
25830 bool Is64Bit = Subtarget.is64Bit();
25831 MVT SPTy = Op.getValueType().getSimpleVT();
25832
25834 if (!Lower) {
25835 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25837 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25838 " not tell us which reg is the stack pointer!");
25839
25840 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25841 const Align StackAlign = TFI.getStackAlign();
25842 if (hasInlineStackProbe(MF)) {
25843 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25844 {Chain, Size});
25845 Chain = Result.getValue(1);
25846 } else {
25847 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25848 Chain = SP.getValue(1);
25849 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25850 }
  // Over-aligned allocations round the result down to the alignment.
25851 if (Alignment && *Alignment > StackAlign)
25852 Result = DAG.getNode(
25853 ISD::AND, dl, VT, Result,
25854 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25855 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25856 } else if (SplitStack) {
25857 if (Is64Bit) {
25858 // The 64 bit implementation of segmented stacks needs to clobber both r10
25859 // r11. This makes it impossible to use it along with nested parameters.
25860 const Function &F = MF.getFunction();
25861 for (const auto &A : F.args()) {
25862 if (A.hasNestAttr())
25863 report_fatal_error("Cannot use segmented stacks with functions that "
25864 "have nested arguments.");
25865 }
25866 }
25867
25868 Result =
25869 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25870 Chain = Result.getValue(1);
25871 } else {
25872 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25873 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25874 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25875
  // DYN_ALLOCA adjusted SP itself; read it back as the result value.
25876 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25877 Register SPReg = RegInfo->getStackRegister();
25878 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25879 Chain = SP.getValue(1);
25880
25881 if (Alignment) {
25882 SP = DAG.getNode(
25883 ISD::AND, dl, VT, SP.getValue(0),
25884 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25885 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25886 }
25887
25888 Result = SP;
25889 }
25890
25891 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25892
  // Return both the allocated pointer and the updated chain.
25893 SDValue Ops[2] = {Result, Chain};
25894 return DAG.getMergeValues(Ops, dl);
25895}
25896
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  // NOTE(review): the declarations of `MF` and `FuncInfo`
  // (X86MachineFunctionInfo) are not visible in this view of the file; both
  // are used below.
  SDValue Ptr = Op.getOperand(1);
  EVT PtrVT = Ptr.getValueType();

  // Operand 2 carries the IR Value of the va_list pointer, used for alias
  // analysis info on the stores below.
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);

  if (!Subtarget.is64Bit() ||
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
  }

  // SysV x86-64: initialize all four fields of the __va_list_tag struct:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  // NOTE(review): the declaration of `MemOps` is not visible in this view.
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV));
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
  Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV, 4));
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
  Store = DAG.getStore(
      Op.getOperand(0), DL, RSFIN, FIN,
      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
  MemOps.push_back(Store);
  // Glue all four stores into a single token for the caller's chain.
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
25954
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert(Op.getNumOperands() == 4);

  // NOTE(review): the declaration of `MF` is not visible in this view.
  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
    // The Win64 ABI uses char* instead of a structure.
    return DAG.expandVAArg(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
  if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
  } else {
    assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
           "Unhandled argument type in LowerVAARG");
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
  }

  if (ArgMode == 2) {
    // Make sure using fp_offset makes sense.
    assert(!Subtarget.useSoftFloat() &&
           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  }

  // Insert VAARG node into the DAG
  // VAARG returns two values: Variable Argument Address, Chain
  SDValue InstOps[] = {Chain, SrcPtr,
                       DAG.getTargetConstant(ArgSize, dl, MVT::i32),
                       DAG.getTargetConstant(ArgMode, dl, MVT::i8),
                       DAG.getTargetConstant(Align, dl, MVT::i32)};
  SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
  // NOTE(review): the head of the getMemIntrinsicNode call that defines
  // `VAARG`, and its trailing memory-operand flags argument, are not visible
  // in this view of the file.
      VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
      /*Alignment=*/std::nullopt,
  Chain = VAARG.getValue(1);

  // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
26012
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
  // where a va_list is still an i8*.
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
  // NOTE(review): the argument line of this isCallingConvWin64 call (the
  // current function's calling convention) is not visible in this view.
  if (Subtarget.isCallingConvWin64(
    // Probably a Win64 va_copy.
    return DAG.expandVACopy(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  // Operands 3/4 carry the IR Values of the destination/source va_lists for
  // alias info on the memcpy.
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  // Copy the whole __va_list_tag struct: 24 bytes on LP64, 16 bytes on X32.
  return DAG.getMemcpy(
      Chain, DL, DstPtr, SrcPtr,
      DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
      Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
      /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
      MachinePointerInfo(SrcSV));
}
26037
26038// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26039static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26040 switch (Opc) {
26041 case ISD::SHL:
26042 case X86ISD::VSHL:
26043 case X86ISD::VSHLI:
26044 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26045 case ISD::SRL:
26046 case X86ISD::VSRL:
26047 case X86ISD::VSRLI:
26048 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26049 case ISD::SRA:
26050 case X86ISD::VSRA:
26051 case X86ISD::VSRAI:
26052 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26053 }
26054 llvm_unreachable("Unknown target vector shift node");
26055}
26056
/// Handle vector element shifts where the shift amount is a constant.
/// Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {
  MVT ElementType = VT.getVectorElementType();

  // Bitcast the source vector to the output type, this is mainly necessary for
  // vXi8/vXi64 shifts.
  if (VT != SrcOp.getSimpleValueType())
    SrcOp = DAG.getBitcast(VT, SrcOp);

  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    // Arithmetic right shifts saturate at (element width - 1); logical shifts
    // of the full width or more produce zero.
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }

  // NOTE(review): the first line of this assert (listing the accepted
  // immediate shift opcodes) is not visible in this view of the file.
         && "Unknown target vector shift-by-constant node");

  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs.
  // NOTE(review): the guarding `if` of this constant-folding block is not
  // visible in this view of the file.
    unsigned ShiftOpc;
    switch (Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      ShiftOpc = ISD::SHL;
      break;
    case X86ISD::VSRLI:
      ShiftOpc = ISD::SRL;
      break;
    case X86ISD::VSRAI:
      ShiftOpc = ISD::SRA;
      break;
    }

    // Let the generic constant folder evaluate the shift on the constant
    // build vector.
    SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
    if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
      return C;
  }

  return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
}
26109
/// Handle vector element shifts by a splat shift amount.
/// \p ShAmtIdx is the element of \p ShAmt that holds the (splatted) amount.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                   SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT AmtVT = ShAmt.getSimpleValueType();
  assert(AmtVT.isVector() && "Vector shift type mismatch");
  assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
         "Illegal vector splat index");

  // Move the splat element to the bottom element.
  if (ShAmtIdx != 0) {
    SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
    Mask[0] = ShAmtIdx;
    ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
  }

  // Peek through any zext node if we can get back to a 128-bit source.
  // NOTE(review): one line of this condition (an additional extend-opcode
  // alternative) is not visible in this view of the file.
  if (AmtVT.getScalarSizeInBits() == 64 &&
      (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
      ShAmt.getOperand(0).getValueType().isSimple() &&
      ShAmt.getOperand(0).getValueType().is128BitVector()) {
    ShAmt = ShAmt.getOperand(0);
    AmtVT = ShAmt.getSimpleValueType();
  }

  // See if we can mask off the upper elements using the existing source node.
  // The shift uses the entire lower 64-bits of the amount vector, so no need to
  // do this for vXi64 types.
  bool IsMasked = false;
  if (AmtVT.getScalarSizeInBits() < 64) {
    if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
        ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
      // If the shift amount has come from a scalar, then zero-extend the scalar
      // before moving to the vector.
      ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
      ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
      ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
      AmtVT = MVT::v4i32;
      IsMasked = true;
    } else if (ShAmt.getOpcode() == ISD::AND) {
      // See if the shift amount is already masked (e.g. for rotation modulo),
      // then we can zero-extend it by setting all the other mask elements to
      // zero.
      SmallVector<SDValue> MaskElts(
          AmtVT.getVectorNumElements(),
          DAG.getConstant(0, dl, AmtVT.getScalarType()));
      MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
      SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
      if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
                                             {ShAmt.getOperand(1), Mask}))) {
        ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
        IsMasked = true;
      }
    }
  }

  // Extract if the shift amount vector is larger than 128-bits.
  if (AmtVT.getSizeInBits() > 128) {
    ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
    AmtVT = ShAmt.getSimpleValueType();
  }

  // Zero-extend bottom element to v2i64 vector type, either by extension or
  // shuffle masking.
  if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
    if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
                                ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
      ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    } else if (Subtarget.hasSSE41()) {
      ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
                          MVT::v2i64, ShAmt);
    } else {
      // Pre-SSE4.1: shift the amount vector left then right by whole bytes to
      // clear the upper lanes while keeping the bottom element.
      SDValue ByteShift = DAG.getTargetConstant(
          (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
      ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
      ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
                          ByteShift);
      ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
                          ByteShift);
    }
  }

  // Change opcode to non-immediate version.
  // NOTE(review): the statement converting `Opc` to its variable-amount form
  // is not visible in this view of the file.

  // The return type has to be a 128-bit type with the same element
  // type as the input type.
  MVT EltVT = VT.getVectorElementType();
  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());

  ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
26205
26206/// Return Mask with the necessary casting or extending
26207/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26208static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26209 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26210 const SDLoc &dl) {
26211
26212 if (isAllOnesConstant(Mask))
26213 return DAG.getConstant(1, dl, MaskVT);
26214 if (X86::isZeroNode(Mask))
26215 return DAG.getConstant(0, dl, MaskVT);
26216
26217 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26218
26219 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26220 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26221 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26222 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
26223 SDValue Lo, Hi;
26224 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26225 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26226 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26227 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26228 } else {
26229 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26230 Mask.getSimpleValueType().getSizeInBits());
26231 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
26232 // are extracted by EXTRACT_SUBVECTOR.
26233 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26234 DAG.getBitcast(BitcastVT, Mask),
26235 DAG.getVectorIdxConstant(0, dl));
26236 }
26237}
26238
/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
// NOTE(review): the first line of this function's signature
// (getVectorMaskingNode(SDValue Op, SDValue Mask, ...)) is not visible in
// this view of the file.
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  // An all-ones mask keeps every element of Op; no blend is needed.
  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  // Canonicalize an undef pass-through to zero so masked-off lanes are
  // well defined.
  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
26260
/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
// NOTE(review): the first line of this function's signature
// (getScalarMaskingNode(SDValue Op, SDValue Mask, ...)) is not visible in
// this view of the file.
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {

  // If bit 0 of a constant mask is set, the masking is a no-op.
  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
  // Extract bit 0 of the i8 mask as a v1i1 value.
  SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
                              DAG.getBitcast(MVT::v8i1, Mask),
                              DAG.getVectorIdxConstant(0, dl));
  // Compare-style opcodes already produce a mask result; combine it with the
  // incoming mask via AND rather than a select.
  if (Op.getOpcode() == X86ISD::FSETCCM ||
      Op.getOpcode() == X86ISD::FSETCCM_SAE ||
      Op.getOpcode() == X86ISD::VFPCLASSS)
    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);

  // Canonicalize an undef pass-through to zero.
  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
26293
// Returns the size in bytes of the EH registration node allocated in the
// parent frame for MSVC EH personalities (see WinEHStatePass for the struct
// definitions).
// NOTE(review): this function's signature line and the heads of its two
// report_fatal_error calls are not visible in this view of the file.
  if (!Fn->hasPersonalityFn())
                 "querying registration node size for function without personality");
  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
  // WinEHStatePass for the full struct definition.
  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
  case EHPersonality::MSVC_X86SEH: return 24; // 6 x 32-bit words
  case EHPersonality::MSVC_CXX: return 16;    // 4 x 32-bit words
  default: break;
  }
      "can only recover FP for 32-bit MSVC EH personality functions");
}
26308
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
///   RegNodeBase = EntryEBP - RegNodeSize
///   ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
// NOTE(review): the first line of this function's signature
// (recoverFramePointer(SelectionDAG &DAG, const Function *Fn, ...)) is not
// visible in this view of the file.
                                 SDValue EntryEBP) {
  // NOTE(review): the declaration of `MF` is not visible in this view.
  SDLoc dl;

  // It's possible that the parent function no longer has a personality function
  // if the exceptional code was optimized away, in which case we just return
  // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
  // registration, or the .set_setframe offset.
  // NOTE(review): the creation of `OffsetSym` is not visible in this view.
  MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
  // prologue to RBP in the parent function.
  const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
  // RegNodeBase = EntryEBP - RegNodeSize
  // ParentFP = RegNodeBase - ParentFrameOffset
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
26350
26351SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26352 SelectionDAG &DAG) const {
26353 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26354 auto isRoundModeCurDirection = [](SDValue Rnd) {
26355 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26356 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26357
26358 return false;
26359 };
26360 auto isRoundModeSAE = [](SDValue Rnd) {
26361 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26362 unsigned RC = C->getZExtValue();
26364 // Clear the NO_EXC bit and check remaining bits.
26366 // As a convenience we allow no other bits or explicitly
26367 // current direction.
26368 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26369 }
26370 }
26371
26372 return false;
26373 };
26374 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26375 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26376 RC = C->getZExtValue();
26378 // Clear the NO_EXC bit and check remaining bits.
26384 }
26385 }
26386
26387 return false;
26388 };
26389
26390 SDLoc dl(Op);
26391 unsigned IntNo = Op.getConstantOperandVal(0);
26392 MVT VT = Op.getSimpleValueType();
26393 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26394
26395 // Propagate flags from original node to transformed node(s).
26396 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26397
26398 if (IntrData) {
26399 switch(IntrData->Type) {
26400 case INTR_TYPE_1OP: {
26401 // We specify 2 possible opcodes for intrinsics with rounding modes.
26402 // First, we check if the intrinsic may have non-default rounding mode,
26403 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26404 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26405 if (IntrWithRoundingModeOpcode != 0) {
26406 SDValue Rnd = Op.getOperand(2);
26407 unsigned RC = 0;
26408 if (isRoundModeSAEToX(Rnd, RC))
26409 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26410 Op.getOperand(1),
26411 DAG.getTargetConstant(RC, dl, MVT::i32));
26412 if (!isRoundModeCurDirection(Rnd))
26413 return SDValue();
26414 }
26415 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26416 Op.getOperand(1));
26417 }
26418 case INTR_TYPE_1OP_SAE: {
26419 SDValue Sae = Op.getOperand(2);
26420
26421 unsigned Opc;
26422 if (isRoundModeCurDirection(Sae))
26423 Opc = IntrData->Opc0;
26424 else if (isRoundModeSAE(Sae))
26425 Opc = IntrData->Opc1;
26426 else
26427 return SDValue();
26428
26429 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26430 }
26431 case INTR_TYPE_2OP: {
26432 SDValue Src2 = Op.getOperand(2);
26433
26434 // We specify 2 possible opcodes for intrinsics with rounding modes.
26435 // First, we check if the intrinsic may have non-default rounding mode,
26436 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26437 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26438 if (IntrWithRoundingModeOpcode != 0) {
26439 SDValue Rnd = Op.getOperand(3);
26440 unsigned RC = 0;
26441 if (isRoundModeSAEToX(Rnd, RC))
26442 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26443 Op.getOperand(1), Src2,
26444 DAG.getTargetConstant(RC, dl, MVT::i32));
26445 if (!isRoundModeCurDirection(Rnd))
26446 return SDValue();
26447 }
26448
26449 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26450 Op.getOperand(1), Src2);
26451 }
26452 case INTR_TYPE_2OP_SAE: {
26453 SDValue Sae = Op.getOperand(3);
26454
26455 unsigned Opc;
26456 if (isRoundModeCurDirection(Sae))
26457 Opc = IntrData->Opc0;
26458 else if (isRoundModeSAE(Sae))
26459 Opc = IntrData->Opc1;
26460 else
26461 return SDValue();
26462
26463 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26464 Op.getOperand(2));
26465 }
26466 case INTR_TYPE_3OP:
26467 case INTR_TYPE_3OP_IMM8: {
26468 SDValue Src1 = Op.getOperand(1);
26469 SDValue Src2 = Op.getOperand(2);
26470 SDValue Src3 = Op.getOperand(3);
26471
26472 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26473 Src3.getValueType() != MVT::i8) {
26474 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26475 }
26476
26477 // We specify 2 possible opcodes for intrinsics with rounding modes.
26478 // First, we check if the intrinsic may have non-default rounding mode,
26479 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26480 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26481 if (IntrWithRoundingModeOpcode != 0) {
26482 SDValue Rnd = Op.getOperand(4);
26483 unsigned RC = 0;
26484 if (isRoundModeSAEToX(Rnd, RC))
26485 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26486 Src1, Src2, Src3,
26487 DAG.getTargetConstant(RC, dl, MVT::i32));
26488 if (!isRoundModeCurDirection(Rnd))
26489 return SDValue();
26490 }
26491
26492 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26493 {Src1, Src2, Src3});
26494 }
26495 case INTR_TYPE_4OP_IMM8: {
26496 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26497 SDValue Src4 = Op.getOperand(4);
26498 if (Src4.getValueType() != MVT::i8) {
26499 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26500 }
26501
26502 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26503 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26504 Src4);
26505 }
26506 case INTR_TYPE_1OP_MASK: {
26507 SDValue Src = Op.getOperand(1);
26508 SDValue PassThru = Op.getOperand(2);
26509 SDValue Mask = Op.getOperand(3);
26510 // We add rounding mode to the Node when
26511 // - RC Opcode is specified and
26512 // - RC is not "current direction".
26513 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26514 if (IntrWithRoundingModeOpcode != 0) {
26515 SDValue Rnd = Op.getOperand(4);
26516 unsigned RC = 0;
26517 if (isRoundModeSAEToX(Rnd, RC))
26518 return getVectorMaskingNode(
26519 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26520 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26521 Mask, PassThru, Subtarget, DAG);
26522 if (!isRoundModeCurDirection(Rnd))
26523 return SDValue();
26524 }
26525 return getVectorMaskingNode(
26526 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26527 Subtarget, DAG);
26528 }
26530 SDValue Src = Op.getOperand(1);
26531 SDValue PassThru = Op.getOperand(2);
26532 SDValue Mask = Op.getOperand(3);
26533 SDValue Rnd = Op.getOperand(4);
26534
26535 unsigned Opc;
26536 if (isRoundModeCurDirection(Rnd))
26537 Opc = IntrData->Opc0;
26538 else if (isRoundModeSAE(Rnd))
26539 Opc = IntrData->Opc1;
26540 else
26541 return SDValue();
26542
26543 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26544 Subtarget, DAG);
26545 }
26546 case INTR_TYPE_SCALAR_MASK: {
26547 SDValue Src1 = Op.getOperand(1);
26548 SDValue Src2 = Op.getOperand(2);
26549 SDValue passThru = Op.getOperand(3);
26550 SDValue Mask = Op.getOperand(4);
26551 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26552 // There are 2 kinds of intrinsics in this group:
26553 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
26554 // (2) With rounding mode and sae - 7 operands.
26555 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26556 if (Op.getNumOperands() == (5U + HasRounding)) {
26557 if (HasRounding) {
26558 SDValue Rnd = Op.getOperand(5);
26559 unsigned RC = 0;
26560 if (isRoundModeSAEToX(Rnd, RC))
26561 return getScalarMaskingNode(
26562 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26563 DAG.getTargetConstant(RC, dl, MVT::i32)),
26564 Mask, passThru, Subtarget, DAG);
26565 if (!isRoundModeCurDirection(Rnd))
26566 return SDValue();
26567 }
26568 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26569 Src2),
26570 Mask, passThru, Subtarget, DAG);
26571 }
26572
26573 assert(Op.getNumOperands() == (6U + HasRounding) &&
26574 "Unexpected intrinsic form");
26575 SDValue RoundingMode = Op.getOperand(5);
26576 unsigned Opc = IntrData->Opc0;
26577 if (HasRounding) {
26578 SDValue Sae = Op.getOperand(6);
26579 if (isRoundModeSAE(Sae))
26580 Opc = IntrWithRoundingModeOpcode;
26581 else if (!isRoundModeCurDirection(Sae))
26582 return SDValue();
26583 }
26584 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26585 Src2, RoundingMode),
26586 Mask, passThru, Subtarget, DAG);
26587 }
26589 SDValue Src1 = Op.getOperand(1);
26590 SDValue Src2 = Op.getOperand(2);
26591 SDValue passThru = Op.getOperand(3);
26592 SDValue Mask = Op.getOperand(4);
26593 SDValue Rnd = Op.getOperand(5);
26594
26595 SDValue NewOp;
26596 unsigned RC = 0;
26597 if (isRoundModeCurDirection(Rnd))
26598 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26599 else if (isRoundModeSAEToX(Rnd, RC))
26600 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26601 DAG.getTargetConstant(RC, dl, MVT::i32));
26602 else
26603 return SDValue();
26604
26605 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26606 }
26608 SDValue Src1 = Op.getOperand(1);
26609 SDValue Src2 = Op.getOperand(2);
26610 SDValue passThru = Op.getOperand(3);
26611 SDValue Mask = Op.getOperand(4);
26612 SDValue Sae = Op.getOperand(5);
26613 unsigned Opc;
26614 if (isRoundModeCurDirection(Sae))
26615 Opc = IntrData->Opc0;
26616 else if (isRoundModeSAE(Sae))
26617 Opc = IntrData->Opc1;
26618 else
26619 return SDValue();
26620
26621 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26622 Mask, passThru, Subtarget, DAG);
26623 }
26624 case INTR_TYPE_2OP_MASK: {
26625 SDValue Src1 = Op.getOperand(1);
26626 SDValue Src2 = Op.getOperand(2);
26627 SDValue PassThru = Op.getOperand(3);
26628 SDValue Mask = Op.getOperand(4);
26629 SDValue NewOp;
26630 if (IntrData->Opc1 != 0) {
26631 SDValue Rnd = Op.getOperand(5);
26632 unsigned RC = 0;
26633 if (isRoundModeSAEToX(Rnd, RC))
26634 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26635 DAG.getTargetConstant(RC, dl, MVT::i32));
26636 else if (!isRoundModeCurDirection(Rnd))
26637 return SDValue();
26638 }
26639 if (!NewOp)
26640 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26641 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26642 }
26644 SDValue Src1 = Op.getOperand(1);
26645 SDValue Src2 = Op.getOperand(2);
26646 SDValue PassThru = Op.getOperand(3);
26647 SDValue Mask = Op.getOperand(4);
26648
26649 unsigned Opc = IntrData->Opc0;
26650 if (IntrData->Opc1 != 0) {
26651 SDValue Sae = Op.getOperand(5);
26652 if (isRoundModeSAE(Sae))
26653 Opc = IntrData->Opc1;
26654 else if (!isRoundModeCurDirection(Sae))
26655 return SDValue();
26656 }
26657
26658 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26659 Mask, PassThru, Subtarget, DAG);
26660 }
26662 SDValue Src1 = Op.getOperand(1);
26663 SDValue Src2 = Op.getOperand(2);
26664 SDValue Src3 = Op.getOperand(3);
26665 SDValue PassThru = Op.getOperand(4);
26666 SDValue Mask = Op.getOperand(5);
26667 SDValue Sae = Op.getOperand(6);
26668 unsigned Opc;
26669 if (isRoundModeCurDirection(Sae))
26670 Opc = IntrData->Opc0;
26671 else if (isRoundModeSAE(Sae))
26672 Opc = IntrData->Opc1;
26673 else
26674 return SDValue();
26675
26676 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26677 Mask, PassThru, Subtarget, DAG);
26678 }
26680 SDValue Src1 = Op.getOperand(1);
26681 SDValue Src2 = Op.getOperand(2);
26682 SDValue Src3 = Op.getOperand(3);
26683 SDValue PassThru = Op.getOperand(4);
26684 SDValue Mask = Op.getOperand(5);
26685
26686 unsigned Opc = IntrData->Opc0;
26687 if (IntrData->Opc1 != 0) {
26688 SDValue Sae = Op.getOperand(6);
26689 if (isRoundModeSAE(Sae))
26690 Opc = IntrData->Opc1;
26691 else if (!isRoundModeCurDirection(Sae))
26692 return SDValue();
26693 }
26694 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26695 Mask, PassThru, Subtarget, DAG);
26696 }
26697 case BLENDV: {
26698 SDValue Src1 = Op.getOperand(1);
26699 SDValue Src2 = Op.getOperand(2);
26700 SDValue Src3 = Op.getOperand(3);
26701
26703 Src3 = DAG.getBitcast(MaskVT, Src3);
26704
26705 // Reverse the operands to match VSELECT order.
26706 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26707 }
26708 case VPERM_2OP : {
26709 SDValue Src1 = Op.getOperand(1);
26710 SDValue Src2 = Op.getOperand(2);
26711
26712 // Swap Src1 and Src2 in the node creation
26713 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26714 }
26715 case CFMA_OP_MASKZ:
26716 case CFMA_OP_MASK: {
26717 SDValue Src1 = Op.getOperand(1);
26718 SDValue Src2 = Op.getOperand(2);
26719 SDValue Src3 = Op.getOperand(3);
26720 SDValue Mask = Op.getOperand(4);
26721 MVT VT = Op.getSimpleValueType();
26722
26723 SDValue PassThru = Src3;
26724 if (IntrData->Type == CFMA_OP_MASKZ)
26725 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26726
26727 // We add rounding mode to the Node when
26728 // - RC Opcode is specified and
26729 // - RC is not "current direction".
26730 SDValue NewOp;
26731 if (IntrData->Opc1 != 0) {
26732 SDValue Rnd = Op.getOperand(5);
26733 unsigned RC = 0;
26734 if (isRoundModeSAEToX(Rnd, RC))
26735 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26736 DAG.getTargetConstant(RC, dl, MVT::i32));
26737 else if (!isRoundModeCurDirection(Rnd))
26738 return SDValue();
26739 }
26740 if (!NewOp)
26741 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26742 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26743 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26744 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26745 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26746 }
26747 case IFMA_OP:
26748 // NOTE: We need to swizzle the operands to pass the multiply operands
26749 // first.
26750 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26751 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26752 case FPCLASSS: {
26753 SDValue Src1 = Op.getOperand(1);
26754 SDValue Imm = Op.getOperand(2);
26755 SDValue Mask = Op.getOperand(3);
26756 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26757 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26758 Subtarget, DAG);
26759 // Need to fill with zeros to ensure the bitcast will produce zeroes
26760 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26761 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26762 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26763 DAG.getVectorIdxConstant(0, dl));
26764 return DAG.getBitcast(MVT::i8, Ins);
26765 }
26766
26767 case CMP_MASK_CC: {
26768 MVT MaskVT = Op.getSimpleValueType();
26769 SDValue CC = Op.getOperand(3);
26770 SDValue Mask = Op.getOperand(4);
26771 // We specify 2 possible opcodes for intrinsics with rounding modes.
26772 // First, we check if the intrinsic may have non-default rounding mode,
26773 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26774 if (IntrData->Opc1 != 0) {
26775 SDValue Sae = Op.getOperand(5);
26776 if (isRoundModeSAE(Sae))
26777 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26778 Op.getOperand(2), CC, Mask, Sae);
26779 if (!isRoundModeCurDirection(Sae))
26780 return SDValue();
26781 }
26782 //default rounding mode
26783 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26784 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26785 }
26786 case CMP_MASK_SCALAR_CC: {
26787 SDValue Src1 = Op.getOperand(1);
26788 SDValue Src2 = Op.getOperand(2);
26789 SDValue CC = Op.getOperand(3);
26790 SDValue Mask = Op.getOperand(4);
26791
26792 SDValue Cmp;
26793 if (IntrData->Opc1 != 0) {
26794 SDValue Sae = Op.getOperand(5);
26795 if (isRoundModeSAE(Sae))
26796 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26797 else if (!isRoundModeCurDirection(Sae))
26798 return SDValue();
26799 }
26800 //default rounding mode
26801 if (!Cmp.getNode())
26802 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26803
26804 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26805 Subtarget, DAG);
26806 // Need to fill with zeros to ensure the bitcast will produce zeroes
26807 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26808 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26809 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26810 DAG.getVectorIdxConstant(0, dl));
26811 return DAG.getBitcast(MVT::i8, Ins);
26812 }
26813 case COMI: { // Comparison intrinsics
26814 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26815 SDValue LHS = Op.getOperand(1);
26816 SDValue RHS = Op.getOperand(2);
26817 // Some conditions require the operands to be swapped.
26818 if (CC == ISD::SETLT || CC == ISD::SETLE)
26819 std::swap(LHS, RHS);
26820
26821 // For AVX10.2, Support EQ and NE.
26822 bool HasAVX10_2_COMX =
26823 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26824
26825 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26826 // For BF type we need to fall back.
26827 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26828
26829 auto ComiOpCode = IntrData->Opc0;
26830 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26831
26832 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26833 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26834
26835 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26836
26837 SDValue SetCC;
26838 switch (CC) {
26839 case ISD::SETEQ: {
26840 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26841 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26842 break;
26843 // (ZF = 1 and PF = 0)
26844 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26845 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26846 break;
26847 }
26848 case ISD::SETNE: {
26849 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26850 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26851 break;
26852 // (ZF = 0 or PF = 1)
26853 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26854 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26855 break;
26856 }
26857 case ISD::SETGT: // (CF = 0 and ZF = 0)
26858 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26859 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26860 break;
26861 }
26862 case ISD::SETGE: // CF = 0
26863 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26864 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26865 break;
26866 default:
26867 llvm_unreachable("Unexpected illegal condition!");
26868 }
26869 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26870 }
26871 case COMI_RM: { // Comparison intrinsics with Sae
26872 SDValue LHS = Op.getOperand(1);
26873 SDValue RHS = Op.getOperand(2);
26874 unsigned CondVal = Op.getConstantOperandVal(3);
26875 SDValue Sae = Op.getOperand(4);
26876
26877 SDValue FCmp;
26878 if (isRoundModeCurDirection(Sae))
26879 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26880 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26881 else if (isRoundModeSAE(Sae))
26882 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26883 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26884 else
26885 return SDValue();
26886 // Need to fill with zeros to ensure the bitcast will produce zeroes
26887 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26888 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26889 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26890 DAG.getVectorIdxConstant(0, dl));
26891 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26892 DAG.getBitcast(MVT::i16, Ins));
26893 }
26894 case VSHIFT: {
26895 SDValue SrcOp = Op.getOperand(1);
26896 SDValue ShAmt = Op.getOperand(2);
26897 assert(ShAmt.getValueType() == MVT::i32 &&
26898 "Unexpected VSHIFT amount type");
26899
26900 // Catch shift-by-constant.
26901 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26902 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26903 Op.getSimpleValueType(), SrcOp,
26904 CShAmt->getZExtValue(), DAG);
26905
26906 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26907 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26908 SrcOp, ShAmt, 0, Subtarget, DAG);
26909 }
26911 SDValue Mask = Op.getOperand(3);
26912 SDValue DataToCompress = Op.getOperand(1);
26913 SDValue PassThru = Op.getOperand(2);
26914 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26915 return Op.getOperand(1);
26916
26917 // Avoid false dependency.
26918 if (PassThru.isUndef())
26919 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26920
26921 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26922 Mask);
26923 }
26924 case FIXUPIMM:
26925 case FIXUPIMM_MASKZ: {
26926 SDValue Src1 = Op.getOperand(1);
26927 SDValue Src2 = Op.getOperand(2);
26928 SDValue Src3 = Op.getOperand(3);
26929 SDValue Imm = Op.getOperand(4);
26930 SDValue Mask = Op.getOperand(5);
26931 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26932 ? Src1
26933 : getZeroVector(VT, Subtarget, DAG, dl);
26934
26935 unsigned Opc = IntrData->Opc0;
26936 if (IntrData->Opc1 != 0) {
26937 SDValue Sae = Op.getOperand(6);
26938 if (isRoundModeSAE(Sae))
26939 Opc = IntrData->Opc1;
26940 else if (!isRoundModeCurDirection(Sae))
26941 return SDValue();
26942 }
26943
26944 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26945
26947 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26948
26949 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26950 }
26951 case ROUNDP: {
26952 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26953 // Clear the upper bits of the rounding immediate so that the legacy
26954 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26955 uint64_t Round = Op.getConstantOperandVal(2);
26956 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26957 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26958 Op.getOperand(1), RoundingMode);
26959 }
26960 case ROUNDS: {
26961 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26962 // Clear the upper bits of the rounding immediate so that the legacy
26963 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26964 uint64_t Round = Op.getConstantOperandVal(3);
26965 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26966 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26967 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26968 }
26969 case BEXTRI: {
26970 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26971
26972 uint64_t Imm = Op.getConstantOperandVal(2);
26973 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26974 Op.getValueType());
26975 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26976 Op.getOperand(1), Control);
26977 }
26978 // ADC/SBB
26979 case ADX: {
26980 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26981 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26982
26983 SDValue Res;
26984 // If the carry in is zero, then we should just use ADD/SUB instead of
26985 // ADC/SBB.
26986 if (isNullConstant(Op.getOperand(1))) {
26987 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26988 Op.getOperand(3));
26989 } else {
26990 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26991 DAG.getAllOnesConstant(dl, MVT::i8));
26992 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26993 Op.getOperand(3), GenCF.getValue(1));
26994 }
26995 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26996 SDValue Results[] = { SetCC, Res };
26997 return DAG.getMergeValues(Results, dl);
26998 }
26999 case CVTPD2PS_MASK:
27000 case CVTPD2DQ_MASK:
27001 case CVTQQ2PS_MASK:
27002 case TRUNCATE_TO_REG: {
27003 SDValue Src = Op.getOperand(1);
27004 SDValue PassThru = Op.getOperand(2);
27005 SDValue Mask = Op.getOperand(3);
27006
27007 if (isAllOnesConstant(Mask))
27008 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27009
27010 MVT SrcVT = Src.getSimpleValueType();
27011 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27012 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27013 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27014 {Src, PassThru, Mask});
27015 }
27016 case TRUNCATE2_TO_REG: {
27017 SDValue Src = Op.getOperand(1);
27018 SDValue Src2 = Op.getOperand(2);
27019 SDValue PassThru = Op.getOperand(3);
27020 SDValue Mask = Op.getOperand(4);
27021
27022 if (isAllOnesConstant(Mask))
27023 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27024
27025 MVT Src2VT = Src2.getSimpleValueType();
27026 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27027 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27028 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27029 {Src, Src2, PassThru, Mask});
27030 }
27031 case CVTPS2PH_MASK: {
27032 SDValue Src = Op.getOperand(1);
27033 SDValue Rnd = Op.getOperand(2);
27034 SDValue PassThru = Op.getOperand(3);
27035 SDValue Mask = Op.getOperand(4);
27036
27037 unsigned RC = 0;
27038 unsigned Opc = IntrData->Opc0;
27039 bool SAE = Src.getValueType().is512BitVector() &&
27040 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27041 if (SAE) {
27043 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27044 }
27045
27046 if (isAllOnesConstant(Mask))
27047 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27048
27049 if (SAE)
27051 else
27052 Opc = IntrData->Opc1;
27053 MVT SrcVT = Src.getSimpleValueType();
27054 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27055 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27056 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27057 }
27058 case CVTNEPS2BF16_MASK: {
27059 SDValue Src = Op.getOperand(1);
27060 SDValue PassThru = Op.getOperand(2);
27061 SDValue Mask = Op.getOperand(3);
27062
27063 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27064 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27065
27066 // Break false dependency.
27067 if (PassThru.isUndef())
27068 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27069
27070 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27071 Mask);
27072 }
27073 default:
27074 break;
27075 }
27076 }
27077
27078 switch (IntNo) {
27079 default: return SDValue(); // Don't custom lower most intrinsics.
27080
27081 // ptest and testp intrinsics. The intrinsic these come from are designed to
27082 // return an integer value, not just an instruction so lower it to the ptest
27083 // or testp pattern and a setcc for the result.
27084 case Intrinsic::x86_avx512_ktestc_b:
27085 case Intrinsic::x86_avx512_ktestc_w:
27086 case Intrinsic::x86_avx512_ktestc_d:
27087 case Intrinsic::x86_avx512_ktestc_q:
27088 case Intrinsic::x86_avx512_ktestz_b:
27089 case Intrinsic::x86_avx512_ktestz_w:
27090 case Intrinsic::x86_avx512_ktestz_d:
27091 case Intrinsic::x86_avx512_ktestz_q:
27092 case Intrinsic::x86_sse41_ptestz:
27093 case Intrinsic::x86_sse41_ptestc:
27094 case Intrinsic::x86_sse41_ptestnzc:
27095 case Intrinsic::x86_avx_ptestz_256:
27096 case Intrinsic::x86_avx_ptestc_256:
27097 case Intrinsic::x86_avx_ptestnzc_256:
27098 case Intrinsic::x86_avx_vtestz_ps:
27099 case Intrinsic::x86_avx_vtestc_ps:
27100 case Intrinsic::x86_avx_vtestnzc_ps:
27101 case Intrinsic::x86_avx_vtestz_pd:
27102 case Intrinsic::x86_avx_vtestc_pd:
27103 case Intrinsic::x86_avx_vtestnzc_pd:
27104 case Intrinsic::x86_avx_vtestz_ps_256:
27105 case Intrinsic::x86_avx_vtestc_ps_256:
27106 case Intrinsic::x86_avx_vtestnzc_ps_256:
27107 case Intrinsic::x86_avx_vtestz_pd_256:
27108 case Intrinsic::x86_avx_vtestc_pd_256:
27109 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27110 unsigned TestOpc = X86ISD::PTEST;
27111 X86::CondCode X86CC;
27112 switch (IntNo) {
27113 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27114 case Intrinsic::x86_avx512_ktestc_b:
27115 case Intrinsic::x86_avx512_ktestc_w:
27116 case Intrinsic::x86_avx512_ktestc_d:
27117 case Intrinsic::x86_avx512_ktestc_q:
27118 // CF = 1
27119 TestOpc = X86ISD::KTEST;
27120 X86CC = X86::COND_B;
27121 break;
27122 case Intrinsic::x86_avx512_ktestz_b:
27123 case Intrinsic::x86_avx512_ktestz_w:
27124 case Intrinsic::x86_avx512_ktestz_d:
27125 case Intrinsic::x86_avx512_ktestz_q:
27126 TestOpc = X86ISD::KTEST;
27127 X86CC = X86::COND_E;
27128 break;
27129 case Intrinsic::x86_avx_vtestz_ps:
27130 case Intrinsic::x86_avx_vtestz_pd:
27131 case Intrinsic::x86_avx_vtestz_ps_256:
27132 case Intrinsic::x86_avx_vtestz_pd_256:
27133 TestOpc = X86ISD::TESTP;
27134 [[fallthrough]];
27135 case Intrinsic::x86_sse41_ptestz:
27136 case Intrinsic::x86_avx_ptestz_256:
27137 // ZF = 1
27138 X86CC = X86::COND_E;
27139 break;
27140 case Intrinsic::x86_avx_vtestc_ps:
27141 case Intrinsic::x86_avx_vtestc_pd:
27142 case Intrinsic::x86_avx_vtestc_ps_256:
27143 case Intrinsic::x86_avx_vtestc_pd_256:
27144 TestOpc = X86ISD::TESTP;
27145 [[fallthrough]];
27146 case Intrinsic::x86_sse41_ptestc:
27147 case Intrinsic::x86_avx_ptestc_256:
27148 // CF = 1
27149 X86CC = X86::COND_B;
27150 break;
27151 case Intrinsic::x86_avx_vtestnzc_ps:
27152 case Intrinsic::x86_avx_vtestnzc_pd:
27153 case Intrinsic::x86_avx_vtestnzc_ps_256:
27154 case Intrinsic::x86_avx_vtestnzc_pd_256:
27155 TestOpc = X86ISD::TESTP;
27156 [[fallthrough]];
27157 case Intrinsic::x86_sse41_ptestnzc:
27158 case Intrinsic::x86_avx_ptestnzc_256:
27159 // ZF and CF = 0
27160 X86CC = X86::COND_A;
27161 break;
27162 }
27163
27164 SDValue LHS = Op.getOperand(1);
27165 SDValue RHS = Op.getOperand(2);
27166 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27167 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27168 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27169 }
27170
27171 case Intrinsic::x86_sse42_pcmpistria128:
27172 case Intrinsic::x86_sse42_pcmpestria128:
27173 case Intrinsic::x86_sse42_pcmpistric128:
27174 case Intrinsic::x86_sse42_pcmpestric128:
27175 case Intrinsic::x86_sse42_pcmpistrio128:
27176 case Intrinsic::x86_sse42_pcmpestrio128:
27177 case Intrinsic::x86_sse42_pcmpistris128:
27178 case Intrinsic::x86_sse42_pcmpestris128:
27179 case Intrinsic::x86_sse42_pcmpistriz128:
27180 case Intrinsic::x86_sse42_pcmpestriz128: {
27181 unsigned Opcode;
27182 X86::CondCode X86CC;
27183 switch (IntNo) {
27184 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27185 case Intrinsic::x86_sse42_pcmpistria128:
27186 Opcode = X86ISD::PCMPISTR;
27187 X86CC = X86::COND_A;
27188 break;
27189 case Intrinsic::x86_sse42_pcmpestria128:
27190 Opcode = X86ISD::PCMPESTR;
27191 X86CC = X86::COND_A;
27192 break;
27193 case Intrinsic::x86_sse42_pcmpistric128:
27194 Opcode = X86ISD::PCMPISTR;
27195 X86CC = X86::COND_B;
27196 break;
27197 case Intrinsic::x86_sse42_pcmpestric128:
27198 Opcode = X86ISD::PCMPESTR;
27199 X86CC = X86::COND_B;
27200 break;
27201 case Intrinsic::x86_sse42_pcmpistrio128:
27202 Opcode = X86ISD::PCMPISTR;
27203 X86CC = X86::COND_O;
27204 break;
27205 case Intrinsic::x86_sse42_pcmpestrio128:
27206 Opcode = X86ISD::PCMPESTR;
27207 X86CC = X86::COND_O;
27208 break;
27209 case Intrinsic::x86_sse42_pcmpistris128:
27210 Opcode = X86ISD::PCMPISTR;
27211 X86CC = X86::COND_S;
27212 break;
27213 case Intrinsic::x86_sse42_pcmpestris128:
27214 Opcode = X86ISD::PCMPESTR;
27215 X86CC = X86::COND_S;
27216 break;
27217 case Intrinsic::x86_sse42_pcmpistriz128:
27218 Opcode = X86ISD::PCMPISTR;
27219 X86CC = X86::COND_E;
27220 break;
27221 case Intrinsic::x86_sse42_pcmpestriz128:
27222 Opcode = X86ISD::PCMPESTR;
27223 X86CC = X86::COND_E;
27224 break;
27225 }
27227 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27228 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27229 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27230 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27231 }
27232
27233 case Intrinsic::x86_sse42_pcmpistri128:
27234 case Intrinsic::x86_sse42_pcmpestri128: {
27235 unsigned Opcode;
27236 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27237 Opcode = X86ISD::PCMPISTR;
27238 else
27239 Opcode = X86ISD::PCMPESTR;
27240
27242 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27243 return DAG.getNode(Opcode, dl, VTs, NewOps);
27244 }
27245
27246 case Intrinsic::x86_sse42_pcmpistrm128:
27247 case Intrinsic::x86_sse42_pcmpestrm128: {
27248 unsigned Opcode;
27249 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27250 Opcode = X86ISD::PCMPISTR;
27251 else
27252 Opcode = X86ISD::PCMPESTR;
27253
27255 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27256 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27257 }
27258
27259 case Intrinsic::eh_sjlj_lsda: {
27261 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27262 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27263 auto &Context = MF.getContext();
27264 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27265 Twine(MF.getFunctionNumber()));
27266 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27267 DAG.getMCSymbol(S, PtrVT));
27268 }
27269
27270 case Intrinsic::x86_seh_lsda: {
27271 // Compute the symbol for the LSDA. We know it'll get emitted later.
27273 SDValue Op1 = Op.getOperand(1);
27274 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27277
27278 // Generate a simple absolute symbol reference. This intrinsic is only
27279 // supported on 32-bit Windows, which isn't PIC.
27280 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27281 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27282 }
27283
27284 case Intrinsic::eh_recoverfp: {
27285 SDValue FnOp = Op.getOperand(1);
27286 SDValue IncomingFPOp = Op.getOperand(2);
27287 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27288 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27289 if (!Fn)
27291 "llvm.eh.recoverfp must take a function as the first argument");
27292 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27293 }
27294
27295 case Intrinsic::localaddress: {
27296 // Returns one of the stack, base, or frame pointer registers, depending on
27297 // which is used to reference local variables.
27299 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27300 Register Reg;
27301 if (RegInfo->hasBasePointer(MF))
27302 Reg = RegInfo->getBaseRegister();
27303 else { // Handles the SP or FP case.
27304 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27305 if (CantUseFP)
27306 Reg = RegInfo->getPtrSizedStackRegister(MF);
27307 else
27308 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27309 }
27310 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27311 }
27312 case Intrinsic::x86_avx512_vp2intersect_q_512:
27313 case Intrinsic::x86_avx512_vp2intersect_q_256:
27314 case Intrinsic::x86_avx512_vp2intersect_q_128:
27315 case Intrinsic::x86_avx512_vp2intersect_d_512:
27316 case Intrinsic::x86_avx512_vp2intersect_d_256:
27317 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27318 SDLoc DL(Op);
27319 MVT MaskVT = Op.getSimpleValueType();
27320 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27322 Op.getOperand(1), Op.getOperand(2));
27323 SDValue Result0 =
27324 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27325 SDValue Result1 =
27326 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27327 return DAG.getMergeValues({Result0, Result1}, DL);
27328 }
27329 case Intrinsic::x86_mmx_pslli_w:
27330 case Intrinsic::x86_mmx_pslli_d:
27331 case Intrinsic::x86_mmx_pslli_q:
27332 case Intrinsic::x86_mmx_psrli_w:
27333 case Intrinsic::x86_mmx_psrli_d:
27334 case Intrinsic::x86_mmx_psrli_q:
27335 case Intrinsic::x86_mmx_psrai_w:
27336 case Intrinsic::x86_mmx_psrai_d: {
27337 SDLoc DL(Op);
27338 SDValue ShAmt = Op.getOperand(2);
27339 // If the argument is a constant, convert it to a target constant.
27340 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27341 // Clamp out of bounds shift amounts since they will otherwise be masked
27342 // to 8-bits which may make it no longer out of bounds.
27343 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27344 if (ShiftAmount == 0)
27345 return Op.getOperand(1);
27346
27347 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27348 Op.getOperand(0), Op.getOperand(1),
27349 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27350 }
27351
27352 unsigned NewIntrinsic;
27353 switch (IntNo) {
27354 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27355 case Intrinsic::x86_mmx_pslli_w:
27356 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27357 break;
27358 case Intrinsic::x86_mmx_pslli_d:
27359 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27360 break;
27361 case Intrinsic::x86_mmx_pslli_q:
27362 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27363 break;
27364 case Intrinsic::x86_mmx_psrli_w:
27365 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27366 break;
27367 case Intrinsic::x86_mmx_psrli_d:
27368 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27369 break;
27370 case Intrinsic::x86_mmx_psrli_q:
27371 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27372 break;
27373 case Intrinsic::x86_mmx_psrai_w:
27374 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27375 break;
27376 case Intrinsic::x86_mmx_psrai_d:
27377 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27378 break;
27379 }
27380
27381 // The vector shift intrinsics with scalars uses 32b shift amounts but
27382 // the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an
27383 // MMX register.
27384 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27385 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27386 DAG.getTargetConstant(NewIntrinsic, DL,
27388 Op.getOperand(1), ShAmt);
27389 }
27390 case Intrinsic::thread_pointer: {
27391 if (Subtarget.isTargetELF()) {
27392 SDLoc dl(Op);
27393 EVT PtrVT = Op.getValueType();
27394 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27396 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27397 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27399 }
27401 "Target OS doesn't support __builtin_thread_pointer() yet.");
27402 }
27403 }
27404}
27405
// NOTE(review): this export dropped the helper's opening signature line
// (source line 27406), so its name and leading parameters are not visible
// here -- presumably `static SDValue getAVX2GatherNode(unsigned Opc, SDValue
// Op, SelectionDAG &DAG, ...)`; confirm against the upstream file.
// Lowers a masked gather intrinsic into an X86ISD::MGATHER memory-intrinsic
// node, returning {gathered value, output chain} as merged values.
27407 SDValue Src, SDValue Mask, SDValue Base,
27408 SDValue Index, SDValue ScaleOp, SDValue Chain,
27409 const X86Subtarget &Subtarget) {
27410 SDLoc dl(Op);
27411 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27412 // Scale must be constant.
27413 if (!C)
27414 return SDValue();
27415 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Rematerialize the scale as a pointer-sized target constant for the node.
27416 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27417 TLI.getPointerTy(DAG.getDataLayout()));
// Here the mask keeps its own vector width; only its element type is
// normalized to integer (contrast with the vXi1-converting variant below).
27418 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27419 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27420 // If source is undef or we know it won't be used, use a zero vector
27421 // to break register dependency.
27422 // TODO: use undef instead and let BreakFalseDeps deal with it?
27423 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27424 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27425
27426 // Cast mask to an integer type.
27427 Mask = DAG.getBitcast(MaskVT, Mask);
27428
// Reuse the original intrinsic's memory VT / MMO on the new gather node.
27429 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27430
27431 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27432 SDValue Res =
27433 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27434 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
// Return both the gathered value and the chain.
27435 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27436}
27437
// NOTE(review): the export dropped this helper's signature line (source line
// 27438, presumably `static SDValue getGatherNode(SDValue Op, SelectionDAG
// &DAG, ...)`) and line 27452 (presumably
// `Src.getSimpleValueType().getVectorNumElements());`, the second argument of
// the std::min below). Confirm both against the upstream file.
// Lowers an AVX-512-style gather intrinsic (scalar or vXi1 mask form) into an
// X86ISD::MGATHER memory-intrinsic node.
27439 SDValue Src, SDValue Mask, SDValue Base,
27440 SDValue Index, SDValue ScaleOp, SDValue Chain,
27441 const X86Subtarget &Subtarget) {
27442 MVT VT = Op.getSimpleValueType();
27443 SDLoc dl(Op);
27444 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27445 // Scale must be constant.
27446 if (!C)
27447 return SDValue();
27448 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27449 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27450 TLI.getPointerTy(DAG.getDataLayout()));
// Mask width is the smaller of the index and data element counts (the
// second std::min argument was lost in the export -- see NOTE above).
27451 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27453 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27454
27455 // We support two versions of the gather intrinsics. One with scalar mask and
27456 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27457 if (Mask.getValueType() != MaskVT)
27458 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27459
27460 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27461 // If source is undef or we know it won't be used, use a zero vector
27462 // to break register dependency.
27463 // TODO: use undef instead and let BreakFalseDeps deal with it?
27464 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27465 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27466
// Carry the original intrinsic's memory VT / memory operand forward.
27467 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27468
27469 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27470 SDValue Res =
27471 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27472 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
// Return both the gathered value and the chain.
27473 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27474}
27475
// NOTE(review): the export dropped this helper's signature line (source line
// 27476, presumably `static SDValue getScatterNode(unsigned Opc, SDValue Op,
// SelectionDAG &DAG, ...)`); confirm against the upstream file.
// Lowers a scatter intrinsic (scalar or vXi1 mask form) into an
// X86ISD::MSCATTER memory-intrinsic node; result is the chain only.
27477 SDValue Src, SDValue Mask, SDValue Base,
27478 SDValue Index, SDValue ScaleOp, SDValue Chain,
27479 const X86Subtarget &Subtarget) {
27480 SDLoc dl(Op);
27481 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27482 // Scale must be constant.
27483 if (!C)
27484 return SDValue();
27485 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27486 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27487 TLI.getPointerTy(DAG.getDataLayout()));
// Mask width is the smaller of the index and source element counts.
27488 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27489 Src.getSimpleValueType().getVectorNumElements());
27490 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27491
27492 // We support two versions of the scatter intrinsics. One with scalar mask and
27493 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27494 if (Mask.getValueType() != MaskVT)
27495 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27496
// Carry the original intrinsic's memory VT / memory operand forward.
27497 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27498
// A scatter produces no value, only a chain.
27499 SDVTList VTs = DAG.getVTList(MVT::Other);
27500 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27501 SDValue Res =
27502 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27503 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27504 return Res;
27505}
27506
// NOTE(review): the export dropped this helper's signature line (source line
// 27507, presumably `static SDValue getPrefetchNode(unsigned Opc, SDValue Op,
// SelectionDAG &DAG, ...)`); confirm against the upstream file.
// Lowers a masked gather/scatter-prefetch intrinsic directly to a machine
// node `Opc` with the full x86 memory-operand tuple (disp/segment included),
// producing only a chain.
27508 SDValue Mask, SDValue Base, SDValue Index,
27509 SDValue ScaleOp, SDValue Chain,
27510 const X86Subtarget &Subtarget) {
27511 SDLoc dl(Op);
27512 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27513 // Scale must be constant.
27514 if (!C)
27515 return SDValue();
27516 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27517 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27518 TLI.getPointerTy(DAG.getDataLayout()));
// Zero displacement and no segment override for the machine memory operand.
27519 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27520 SDValue Segment = DAG.getRegister(0, MVT::i32);
// Mask width follows the index vector's element count here.
27521 MVT MaskVT =
27522 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements())
;
27523 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// Operand order matches the machine instruction's expected layout.
27524 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27525 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27526 return SDValue(Res, 0);
27527}
27528
27529/// Handles the lowering of builtin intrinsics with chain that return their
27530/// value into registers EDX:EAX.
27531/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27532/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27533/// TargetOpcode.
27534/// Returns a Glue value which can be used to add extra copy-from-reg if the
27535/// expanded intrinsics implicitly defines extra registers (i.e. not just
27536/// EDX:EAX).
// NOTE(review): the export dropped the signature's first line (source line
// 27537, presumably `static SDValue expandIntrinsicWChainHelper(SDNode *N,
// const SDLoc &DL,`) and line 27542 (presumably the trailing
// `SmallVectorImpl<SDValue> &Results) {` parameter) -- confirm upstream.
27538 SelectionDAG &DAG,
27539 unsigned TargetOpcode,
27540 unsigned SrcReg,
27541 const X86Subtarget &Subtarget,
27543 SDValue Chain = N->getOperand(0);
27544 SDValue Glue;
27545
// Optionally feed operand 2 into SrcReg, glued so it reaches the machine
// node as an implicit input.
27546 if (SrcReg) {
27547 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27548 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27549 Glue = Chain.getValue(1);
27550 }
27551
// Emit the target machine node; include the glue operand only when a
// copy-to-reg was emitted above.
27552 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27553 SDValue N1Ops[] = {Chain, Glue};
27554 SDNode *N1 = DAG.getMachineNode(
27555 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27556 Chain = SDValue(N1, 0);
27557
27558 // Reads the content of XCR and returns it in registers EDX:EAX.
// (On 64-bit targets the copies read RAX/RDX; the low 32 bits of each hold
// the EAX/EDX results.)
27559 SDValue LO, HI;
27560 if (Subtarget.is64Bit()) {
27561 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27562 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27563 LO.getValue(2));
27564 } else {
27565 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27566 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27567 LO.getValue(2));
27568 }
// Keep the final chain and glue so callers can append further
// copy-from-reg nodes for extra implicitly-defined registers.
27569 Chain = HI.getValue(1);
27570 Glue = HI.getValue(2);
27571
27572 if (Subtarget.is64Bit()) {
27573 // Merge the two 32-bit values into a 64-bit one.
27574 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27575 DAG.getConstant(32, DL, MVT::i8));
27576 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27577 Results.push_back(Chain);
27578 return Glue;
27579 }
27580
27581 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27582 SDValue Ops[] = { LO, HI };
27583 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27584 Results.push_back(Pair);
27585 Results.push_back(Chain);
27586 return Glue;
27587}
27588
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
  // NOTE(review): the final parameter line (the SmallVectorImpl<SDValue>
  // &Results out-parameter and the opening brace) is missing from this
  // extraction; Results is used below.
  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
                                             /* NoRegister */0, Subtarget,
                                             Results);
  // Only RDTSCP defines an extra register (ECX); plain RDTSC (and
  // READCYCLECOUNTER) is fully lowered at this point.
  if (Opcode != X86::RDTSCP)
    return;

  SDValue Chain = Results[1];
  // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
  // the ECX register. Add 'ecx' explicitly to the chain.
  SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
  // On entry Results is {counter value, chain}; replace the chain slot with
  // the ECX value and append the copy's chain so the read stays ordered.
  Results[1] = ecx;
  Results.push_back(ecx.getValue(1));
}
27612
27614 SelectionDAG &DAG) {
27616 SDLoc DL(Op);
27617 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27618 Results);
27619 return DAG.getMergeValues(Results, DL);
27620}
27621
27624 SDValue Chain = Op.getOperand(0);
27625 SDValue RegNode = Op.getOperand(2);
27626 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27627 if (!EHInfo)
27628 report_fatal_error("EH registrations only live in functions using WinEH");
27629
27630 // Cast the operand to an alloca, and remember the frame index.
27631 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27632 if (!FINode)
27633 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27634 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27635
27636 // Return the chain operand without making any DAG nodes.
27637 return Chain;
27638}
27639
27642 SDValue Chain = Op.getOperand(0);
27643 SDValue EHGuard = Op.getOperand(2);
27644 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27645 if (!EHInfo)
27646 report_fatal_error("EHGuard only live in functions using WinEH");
27647
27648 // Cast the operand to an alloca, and remember the frame index.
27649 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27650 if (!FINode)
27651 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27652 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27653
27654 // Return the chain operand without making any DAG nodes.
27655 return Chain;
27656}
27657
27658/// Emit Truncating Store with signed or unsigned saturation.
27659static SDValue
27660EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27661 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27662 SelectionDAG &DAG) {
27663 SDVTList VTs = DAG.getVTList(MVT::Other);
27664 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27665 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27666 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27667 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27668}
27669
27670/// Emit Masked Truncating Store with signed or unsigned saturation.
27671static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27672 const SDLoc &DL,
27673 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27674 MachineMemOperand *MMO, SelectionDAG &DAG) {
27675 SDVTList VTs = DAG.getVTList(MVT::Other);
27676 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27677 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27678 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27679}
27680
27682 const MachineFunction &MF) {
27683 if (!Subtarget.is64Bit())
27684 return false;
27685 // 64-bit targets support extended Swift async frame setup,
27686 // except for targets that use the windows 64 prologue.
27687 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27688}
27689
27691 SelectionDAG &DAG) {
27692 unsigned IntNo = Op.getConstantOperandVal(1);
27693 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27694 if (!IntrData) {
27695 switch (IntNo) {
27696
27697 case Intrinsic::swift_async_context_addr: {
27698 SDLoc dl(Op);
27699 auto &MF = DAG.getMachineFunction();
27700 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27701 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27703 X86FI->setHasSwiftAsyncContext(true);
27704 SDValue Chain = Op->getOperand(0);
27705 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27706 SDValue Result =
27707 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27708 DAG.getTargetConstant(8, dl, MVT::i32)),
27709 0);
27710 // Return { result, chain }.
27711 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27712 CopyRBP.getValue(1));
27713 } else {
27714 // No special extended frame, create or reuse an existing stack slot.
27715 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27716 if (!X86FI->getSwiftAsyncContextFrameIdx())
27717 X86FI->setSwiftAsyncContextFrameIdx(
27718 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27719 false));
27720 SDValue Result =
27721 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27722 PtrSize == 8 ? MVT::i64 : MVT::i32);
27723 // Return { result, chain }.
27724 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27725 Op->getOperand(0));
27726 }
27727 }
27728
27729 case llvm::Intrinsic::x86_seh_ehregnode:
27730 return MarkEHRegistrationNode(Op, DAG);
27731 case llvm::Intrinsic::x86_seh_ehguard:
27732 return MarkEHGuard(Op, DAG);
27733 case llvm::Intrinsic::x86_rdpkru: {
27734 SDLoc dl(Op);
27735 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27736 // Create a RDPKRU node and pass 0 to the ECX parameter.
27737 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27738 DAG.getConstant(0, dl, MVT::i32));
27739 }
27740 case llvm::Intrinsic::x86_wrpkru: {
27741 SDLoc dl(Op);
27742 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27743 // to the EDX and ECX parameters.
27744 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27745 Op.getOperand(0), Op.getOperand(2),
27746 DAG.getConstant(0, dl, MVT::i32),
27747 DAG.getConstant(0, dl, MVT::i32));
27748 }
27749 case llvm::Intrinsic::asan_check_memaccess: {
27750 // Mark this as adjustsStack because it will be lowered to a call.
27752 // Don't do anything here, we will expand these intrinsics out later.
27753 return Op;
27754 }
27755 case llvm::Intrinsic::x86_flags_read_u32:
27756 case llvm::Intrinsic::x86_flags_read_u64:
27757 case llvm::Intrinsic::x86_flags_write_u32:
27758 case llvm::Intrinsic::x86_flags_write_u64: {
27759 // We need a frame pointer because this will get lowered to a PUSH/POP
27760 // sequence.
27763 // Don't do anything here, we will expand these intrinsics out later
27764 // during FinalizeISel in EmitInstrWithCustomInserter.
27765 return Op;
27766 }
27767 case Intrinsic::x86_lwpins32:
27768 case Intrinsic::x86_lwpins64:
27769 case Intrinsic::x86_umwait:
27770 case Intrinsic::x86_tpause: {
27771 SDLoc dl(Op);
27772 SDValue Chain = Op->getOperand(0);
27773 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27774 unsigned Opcode;
27775
27776 switch (IntNo) {
27777 default: llvm_unreachable("Impossible intrinsic");
27778 case Intrinsic::x86_umwait:
27779 Opcode = X86ISD::UMWAIT;
27780 break;
27781 case Intrinsic::x86_tpause:
27782 Opcode = X86ISD::TPAUSE;
27783 break;
27784 case Intrinsic::x86_lwpins32:
27785 case Intrinsic::x86_lwpins64:
27786 Opcode = X86ISD::LWPINS;
27787 break;
27788 }
27789
27791 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27792 Op->getOperand(3), Op->getOperand(4));
27793 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27794 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27795 Operation.getValue(1));
27796 }
27797 case Intrinsic::x86_enqcmd:
27798 case Intrinsic::x86_enqcmds: {
27799 SDLoc dl(Op);
27800 SDValue Chain = Op.getOperand(0);
27801 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27802 unsigned Opcode;
27803 switch (IntNo) {
27804 default: llvm_unreachable("Impossible intrinsic!");
27805 case Intrinsic::x86_enqcmd:
27806 Opcode = X86ISD::ENQCMD;
27807 break;
27808 case Intrinsic::x86_enqcmds:
27809 Opcode = X86ISD::ENQCMDS;
27810 break;
27811 }
27812 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27813 Op.getOperand(3));
27814 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27815 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27816 Operation.getValue(1));
27817 }
27818 case Intrinsic::x86_aesenc128kl:
27819 case Intrinsic::x86_aesdec128kl:
27820 case Intrinsic::x86_aesenc256kl:
27821 case Intrinsic::x86_aesdec256kl: {
27822 SDLoc DL(Op);
27823 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27824 SDValue Chain = Op.getOperand(0);
27825 unsigned Opcode;
27826
27827 switch (IntNo) {
27828 default: llvm_unreachable("Impossible intrinsic");
27829 case Intrinsic::x86_aesenc128kl:
27830 Opcode = X86ISD::AESENC128KL;
27831 break;
27832 case Intrinsic::x86_aesdec128kl:
27833 Opcode = X86ISD::AESDEC128KL;
27834 break;
27835 case Intrinsic::x86_aesenc256kl:
27836 Opcode = X86ISD::AESENC256KL;
27837 break;
27838 case Intrinsic::x86_aesdec256kl:
27839 Opcode = X86ISD::AESDEC256KL;
27840 break;
27841 }
27842
27843 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27844 MachineMemOperand *MMO = MemIntr->getMemOperand();
27845 EVT MemVT = MemIntr->getMemoryVT();
27847 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27848 MMO);
27849 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27850
27851 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27852 {ZF, Operation.getValue(0), Operation.getValue(2)});
27853 }
27854 case Intrinsic::x86_aesencwide128kl:
27855 case Intrinsic::x86_aesdecwide128kl:
27856 case Intrinsic::x86_aesencwide256kl:
27857 case Intrinsic::x86_aesdecwide256kl: {
27858 SDLoc DL(Op);
27859 SDVTList VTs = DAG.getVTList(
27860 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27861 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27862 SDValue Chain = Op.getOperand(0);
27863 unsigned Opcode;
27864
27865 switch (IntNo) {
27866 default: llvm_unreachable("Impossible intrinsic");
27867 case Intrinsic::x86_aesencwide128kl:
27868 Opcode = X86ISD::AESENCWIDE128KL;
27869 break;
27870 case Intrinsic::x86_aesdecwide128kl:
27871 Opcode = X86ISD::AESDECWIDE128KL;
27872 break;
27873 case Intrinsic::x86_aesencwide256kl:
27874 Opcode = X86ISD::AESENCWIDE256KL;
27875 break;
27876 case Intrinsic::x86_aesdecwide256kl:
27877 Opcode = X86ISD::AESDECWIDE256KL;
27878 break;
27879 }
27880
27881 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27882 MachineMemOperand *MMO = MemIntr->getMemOperand();
27883 EVT MemVT = MemIntr->getMemoryVT();
27885 Opcode, DL, VTs,
27886 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27887 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27888 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27889 MemVT, MMO);
27890 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27891
27892 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27893 {ZF, Operation.getValue(1), Operation.getValue(2),
27894 Operation.getValue(3), Operation.getValue(4),
27895 Operation.getValue(5), Operation.getValue(6),
27896 Operation.getValue(7), Operation.getValue(8),
27897 Operation.getValue(9)});
27898 }
27899 case Intrinsic::x86_testui: {
27900 SDLoc dl(Op);
27901 SDValue Chain = Op.getOperand(0);
27902 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27903 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27904 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27905 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27906 Operation.getValue(1));
27907 }
27908 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27909 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27910 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27911 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27912 case Intrinsic::x86_t2rpntlvwz0_internal:
27913 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27914 case Intrinsic::x86_t2rpntlvwz1_internal:
27915 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27916 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27918 unsigned IntNo = Op.getConstantOperandVal(1);
27919 unsigned Opc = 0;
27920 switch (IntNo) {
27921 default:
27922 llvm_unreachable("Unexpected intrinsic!");
27923 case Intrinsic::x86_t2rpntlvwz0_internal:
27924 Opc = X86::PT2RPNTLVWZ0V;
27925 break;
27926 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27927 Opc = X86::PT2RPNTLVWZ0T1V;
27928 break;
27929 case Intrinsic::x86_t2rpntlvwz1_internal:
27930 Opc = X86::PT2RPNTLVWZ1V;
27931 break;
27932 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27933 Opc = X86::PT2RPNTLVWZ1T1V;
27934 break;
27935 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27936 Opc = X86::PT2RPNTLVWZ0RSV;
27937 break;
27938 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27939 Opc = X86::PT2RPNTLVWZ0RST1V;
27940 break;
27941 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27942 Opc = X86::PT2RPNTLVWZ1RSV;
27943 break;
27944 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27945 Opc = X86::PT2RPNTLVWZ1RST1V;
27946 break;
27947 }
27948
27949 SDLoc DL(Op);
27950 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27951
27952 SDValue Ops[] = {Op.getOperand(2), // Row
27953 Op.getOperand(3), // Col0
27954 Op.getOperand(4), // Col1
27955 Op.getOperand(5), // Base
27956 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27957 Op.getOperand(6), // Index
27958 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27959 DAG.getRegister(0, MVT::i16), // Segment
27960 Op.getOperand(0)}; // Chain
27961
27962 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27963 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27964 SDValue(Res, 0));
27965 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27966 SDValue(Res, 0));
27967 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27968 }
27969 case Intrinsic::x86_atomic_bts_rm:
27970 case Intrinsic::x86_atomic_btc_rm:
27971 case Intrinsic::x86_atomic_btr_rm: {
27972 SDLoc DL(Op);
27973 MVT VT = Op.getSimpleValueType();
27974 SDValue Chain = Op.getOperand(0);
27975 SDValue Op1 = Op.getOperand(2);
27976 SDValue Op2 = Op.getOperand(3);
27977 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27978 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27980 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27981 SDValue Res =
27982 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27983 {Chain, Op1, Op2}, VT, MMO);
27984 Chain = Res.getValue(1);
27985 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27986 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27987 }
27988 case Intrinsic::x86_atomic_bts:
27989 case Intrinsic::x86_atomic_btc:
27990 case Intrinsic::x86_atomic_btr: {
27991 SDLoc DL(Op);
27992 MVT VT = Op.getSimpleValueType();
27993 SDValue Chain = Op.getOperand(0);
27994 SDValue Op1 = Op.getOperand(2);
27995 SDValue Op2 = Op.getOperand(3);
27996 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27997 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27998 : X86ISD::LBTR;
27999 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28000 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28001 SDValue Res =
28002 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28003 {Chain, Op1, Op2, Size}, VT, MMO);
28004 Chain = Res.getValue(1);
28005 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28006 unsigned Imm = Op2->getAsZExtVal();
28007 if (Imm)
28008 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28009 DAG.getShiftAmountConstant(Imm, VT, DL));
28010 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28011 }
28012 case Intrinsic::x86_cmpccxadd32:
28013 case Intrinsic::x86_cmpccxadd64: {
28014 SDLoc DL(Op);
28015 SDValue Chain = Op.getOperand(0);
28016 SDValue Addr = Op.getOperand(2);
28017 SDValue Src1 = Op.getOperand(3);
28018 SDValue Src2 = Op.getOperand(4);
28019 SDValue CC = Op.getOperand(5);
28020 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28022 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28023 MVT::i32, MMO);
28024 return Operation;
28025 }
28026 case Intrinsic::x86_aadd32:
28027 case Intrinsic::x86_aadd64:
28028 case Intrinsic::x86_aand32:
28029 case Intrinsic::x86_aand64:
28030 case Intrinsic::x86_aor32:
28031 case Intrinsic::x86_aor64:
28032 case Intrinsic::x86_axor32:
28033 case Intrinsic::x86_axor64: {
28034 SDLoc DL(Op);
28035 SDValue Chain = Op.getOperand(0);
28036 SDValue Op1 = Op.getOperand(2);
28037 SDValue Op2 = Op.getOperand(3);
28038 MVT VT = Op2.getSimpleValueType();
28039 unsigned Opc = 0;
28040 switch (IntNo) {
28041 default:
28042 llvm_unreachable("Unknown Intrinsic");
28043 case Intrinsic::x86_aadd32:
28044 case Intrinsic::x86_aadd64:
28045 Opc = X86ISD::AADD;
28046 break;
28047 case Intrinsic::x86_aand32:
28048 case Intrinsic::x86_aand64:
28049 Opc = X86ISD::AAND;
28050 break;
28051 case Intrinsic::x86_aor32:
28052 case Intrinsic::x86_aor64:
28053 Opc = X86ISD::AOR;
28054 break;
28055 case Intrinsic::x86_axor32:
28056 case Intrinsic::x86_axor64:
28057 Opc = X86ISD::AXOR;
28058 break;
28059 }
28060 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28061 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28062 {Chain, Op1, Op2}, VT, MMO);
28063 }
28064 case Intrinsic::x86_atomic_add_cc:
28065 case Intrinsic::x86_atomic_sub_cc:
28066 case Intrinsic::x86_atomic_or_cc:
28067 case Intrinsic::x86_atomic_and_cc:
28068 case Intrinsic::x86_atomic_xor_cc: {
28069 SDLoc DL(Op);
28070 SDValue Chain = Op.getOperand(0);
28071 SDValue Op1 = Op.getOperand(2);
28072 SDValue Op2 = Op.getOperand(3);
28073 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28074 MVT VT = Op2.getSimpleValueType();
28075 unsigned Opc = 0;
28076 switch (IntNo) {
28077 default:
28078 llvm_unreachable("Unknown Intrinsic");
28079 case Intrinsic::x86_atomic_add_cc:
28080 Opc = X86ISD::LADD;
28081 break;
28082 case Intrinsic::x86_atomic_sub_cc:
28083 Opc = X86ISD::LSUB;
28084 break;
28085 case Intrinsic::x86_atomic_or_cc:
28086 Opc = X86ISD::LOR;
28087 break;
28088 case Intrinsic::x86_atomic_and_cc:
28089 Opc = X86ISD::LAND;
28090 break;
28091 case Intrinsic::x86_atomic_xor_cc:
28092 Opc = X86ISD::LXOR;
28093 break;
28094 }
28095 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28096 SDValue LockArith =
28097 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28098 {Chain, Op1, Op2}, VT, MMO);
28099 Chain = LockArith.getValue(1);
28100 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28101 }
28102 }
28103 return SDValue();
28104 }
28105
28106 SDLoc dl(Op);
28107 switch(IntrData->Type) {
28108 default: llvm_unreachable("Unknown Intrinsic Type");
28109 case RDSEED:
28110 case RDRAND: {
28111 // Emit the node with the right value type.
28112 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28113 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28114
28115 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28116 // Otherwise return the value from Rand, which is always 0, casted to i32.
28117 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28118 DAG.getConstant(1, dl, Op->getValueType(1)),
28119 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28120 SDValue(Result.getNode(), 1)};
28121 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28122
28123 // Return { result, isValid, chain }.
28124 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28125 SDValue(Result.getNode(), 2));
28126 }
28127 case GATHER_AVX2: {
28128 SDValue Chain = Op.getOperand(0);
28129 SDValue Src = Op.getOperand(2);
28130 SDValue Base = Op.getOperand(3);
28131 SDValue Index = Op.getOperand(4);
28132 SDValue Mask = Op.getOperand(5);
28133 SDValue Scale = Op.getOperand(6);
28134 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28135 Scale, Chain, Subtarget);
28136 }
28137 case GATHER: {
28138 //gather(v1, mask, index, base, scale);
28139 SDValue Chain = Op.getOperand(0);
28140 SDValue Src = Op.getOperand(2);
28141 SDValue Base = Op.getOperand(3);
28142 SDValue Index = Op.getOperand(4);
28143 SDValue Mask = Op.getOperand(5);
28144 SDValue Scale = Op.getOperand(6);
28145 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28146 Chain, Subtarget);
28147 }
28148 case SCATTER: {
28149 //scatter(base, mask, index, v1, scale);
28150 SDValue Chain = Op.getOperand(0);
28151 SDValue Base = Op.getOperand(2);
28152 SDValue Mask = Op.getOperand(3);
28153 SDValue Index = Op.getOperand(4);
28154 SDValue Src = Op.getOperand(5);
28155 SDValue Scale = Op.getOperand(6);
28156 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28157 Scale, Chain, Subtarget);
28158 }
28159 case PREFETCH: {
28160 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28161 assert((HintVal == 2 || HintVal == 3) &&
28162 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28163 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28164 SDValue Chain = Op.getOperand(0);
28165 SDValue Mask = Op.getOperand(2);
28166 SDValue Index = Op.getOperand(3);
28167 SDValue Base = Op.getOperand(4);
28168 SDValue Scale = Op.getOperand(5);
28169 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28170 Subtarget);
28171 }
28172 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28173 case RDTSC: {
28175 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28176 Results);
28177 return DAG.getMergeValues(Results, dl);
28178 }
28179 // Read Performance Monitoring Counters.
28180 case RDPMC:
28181 // Read Processor Register.
28182 case RDPRU:
28183 // GetExtended Control Register.
28184 case XGETBV: {
28186
28187 // RDPMC uses ECX to select the index of the performance counter to read.
28188 // RDPRU uses ECX to select the processor register to read.
28189 // XGETBV uses ECX to select the index of the XCR register to return.
28190 // The result is stored into registers EDX:EAX.
28191 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28192 Subtarget, Results);
28193 return DAG.getMergeValues(Results, dl);
28194 }
28195 // XTEST intrinsics.
28196 case XTEST: {
28197 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28198 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28199
28200 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28201 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28202 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28203 Ret, SDValue(InTrans.getNode(), 1));
28204 }
28207 case TRUNCATE_TO_MEM_VI32: {
28208 SDValue Mask = Op.getOperand(4);
28209 SDValue DataToTruncate = Op.getOperand(3);
28210 SDValue Addr = Op.getOperand(2);
28211 SDValue Chain = Op.getOperand(0);
28212
28213 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28214 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28215
28216 EVT MemVT = MemIntr->getMemoryVT();
28217
28218 uint16_t TruncationOp = IntrData->Opc0;
28219 switch (TruncationOp) {
28220 case X86ISD::VTRUNC: {
28221 if (isAllOnesConstant(Mask)) // return just a truncate store
28222 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28223 MemIntr->getMemOperand());
28224
28225 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28226 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28227 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28228
28229 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28230 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28231 true /* truncating */);
28232 }
28233 case X86ISD::VTRUNCUS:
28234 case X86ISD::VTRUNCS: {
28235 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28236 if (isAllOnesConstant(Mask))
28237 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28238 MemIntr->getMemOperand(), DAG);
28239
28240 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28241 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28242
28243 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28244 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28245 }
28246 default:
28247 llvm_unreachable("Unsupported truncstore intrinsic");
28248 }
28249 }
28250 case INTR_TYPE_CAST_MMX:
28251 return SDValue(); // handled in combineINTRINSIC_*
28252 }
28253}
28254
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  // NOTE(review): the declaration of MFI (presumably
  // MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo()) is
  // missing from this extraction.
  MFI.setReturnAddressIsTaken(true);

  // Operand 0 selects how many frames up the stack to look.
  unsigned Depth = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  EVT PtrVT = Op.getValueType();

  if (Depth > 0) {
    // For an outer frame, compute that frame's address and load one
    // pointer-sized slot above it.
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
    // NOTE(review): the trailing MachinePointerInfo() argument of getLoad
    // is missing from this extraction.
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
  // NOTE(review): the trailing MachinePointerInfo() argument of getLoad is
  // missing from this extraction.
}
28278
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  // NOTE(review): a statement line (presumably marking the return address as
  // taken on the MachineFrameInfo) is missing from this extraction.
  // Hand back the frame index covering the return-address slot.
  return getReturnAddressFrameIndex(DAG);
}
28284
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  // NOTE(review): the declarations of MF (the MachineFunction) and FuncInfo
  // (the X86MachineFunctionInfo) are missing from this extraction; both are
  // used below.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  // Record that the frame address is observed so frame lowering keeps it
  // recoverable.
  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    // Cache and reuse the fixed object so repeated queries share one slot.
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  Register FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = Op.getConstantOperandVal(0);
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  // Start from the current frame register and dereference Depth times
  // (each load presumably reads the saved frame pointer of the next-outer
  // frame — standard frame-chain walk).
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
  // NOTE(review): the trailing MachinePointerInfo() argument of getLoad is
  // missing from this extraction.
  return FrameAddr;
}
28322
28323// FIXME? Maybe this could be a TableGen attribute on some registers and
28324// this table could be generated automatically from RegInfo.
28326 const MachineFunction &MF) const {
28327 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28328
28330 .Case("esp", X86::ESP)
28331 .Case("rsp", X86::RSP)
28332 .Case("ebp", X86::EBP)
28333 .Case("rbp", X86::RBP)
28334 .Case("r14", X86::R14)
28335 .Case("r15", X86::R15)
28336 .Default(0);
28337
28338 if (Reg == X86::EBP || Reg == X86::RBP) {
28339 if (!TFI.hasFP(MF))
28340 report_fatal_error("register " + StringRef(RegName) +
28341 " is allocatable: function has no frame pointer");
28342#ifndef NDEBUG
28343 else {
28344 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28345 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28346 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28347 "Invalid Frame Register!");
28348 }
28349#endif
28350 }
28351
28352 return Reg;
28353}
28354
28355SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28356 SelectionDAG &DAG) const {
28357 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28358 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28359}
28360
28362 const Constant *PersonalityFn) const {
28363 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28364 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28365
28366 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28367}
28368
28370 const Constant *PersonalityFn) const {
28371 // Funclet personalities don't use selectors (the runtime does the selection).
28373 return X86::NoRegister;
28374 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28375}
28376
28378 return Subtarget.isTargetWin64();
28379}
28380
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  // Lower EH_RETURN: store the handler address one slot above the frame
  // address (plus the requested offset), publish that store address in
  // RCX/ECX, and emit X86ISD::EH_RETURN for the backend to finish.
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);  // Extra adjustment applied to the slot.
  SDValue Handler = Op.getOperand(2); // Address to resume execution at.
  SDLoc dl (Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  // StoreAddr = Frame + SlotSize + Offset.
  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}
28406
28407SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28408 SelectionDAG &DAG) const {
28409 SDLoc DL(Op);
28410 // If the subtarget is not 64bit, we may need the global base reg
28411 // after isel expand pseudo, i.e., after CGBR pass ran.
28412 // Therefore, ask for the GlobalBaseReg now, so that the pass
28413 // inserts the code for us in case we need it.
28414 // Otherwise, we will end up in a situation where we will
28415 // reference a virtual register that is not defined!
28416 if (!Subtarget.is64Bit()) {
28417 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28418 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28419 }
28420 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28421 DAG.getVTList(MVT::i32, MVT::Other),
28422 Op.getOperand(0), Op.getOperand(1));
28423}
28424
28425SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28426 SelectionDAG &DAG) const {
28427 SDLoc DL(Op);
28428 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28429 Op.getOperand(0), Op.getOperand(1));
28430}
28431
28432SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28433 SelectionDAG &DAG) const {
28434 SDLoc DL(Op);
28435 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28436 Op.getOperand(0));
28437}
28438
  // Nothing to adjust on x86; the trampoline address is used as-is.
  return Op.getOperand(0);
}
28442
// Write the machine code of a nested-function trampoline into memory: the
// trampoline loads the 'nest' parameter into its dedicated register and then
// jumps to the real (nested) function.
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0); // incoming chain
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    // 64-bit trampoline: movabsq $FPtr, %r11 ; movabsq $Nest, %r10 ;
    // jmpq *%r11 — stored piecewise at byte offsets 0..22 below.
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    // 8-byte immediate operand of the first movabsq (offset 2).
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2), Align(2));

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    // 8-byte immediate operand of the second movabsq (offset 12).
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12), Align(2));

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    // ModRM byte selecting r11 as the indirect-jump target register.
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    // Select the register this calling convention reserves for 'nest'.
    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 0;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
            const DataLayout &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::Fast:
    case CallingConv::Tail:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    // 32-bit trampoline: movl $Nest, %<NestReg> ; jmp <FPtr> — 10 bytes.
    SDValue OutChains[4];
    SDValue Addr, Disp;

    // Relative displacement from the end of the 5-byte jmp (offset 10) to the
    // nested function.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1), Align(1));

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] =
        DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
                     MachinePointerInfo(TrmpAddr, 5), Align(1));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6), Align(1));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
28587
SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  GET_ROUNDING, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we use a packed lookup table of the four 2-bit
  values that we can index by FPSP[11:10]
    0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]

    (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
  */

  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));


  SDValue Chain = Op.getOperand(0);
  SDValue Ops[] = {Chain, StackSlot};
      DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
  Chain = CWD.getValue(1);

  // Mask and turn the control bits into a shift for the lookup table.
  // (CWD & 0xc00) >> 9 leaves the rounding mode at bit 1, i.e. 2*rm, which
  // is the bit-shift amount for the 2-bit LUT entries.
  SDValue Shift =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));
  Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);

  // Index the packed table: (0x2d >> (2 * rm)) & 3.
  SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i32,
                  DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
                  DAG.getConstant(3, DL, MVT::i32));

  RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);

  return DAG.getMergeValues({RetVal, Chain}, DL);
}
28651
SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getNode()->getOperand(0);

  // FP control word may be set only from data in memory. So we need to allocate
  // stack space to save/load FP control word.
  int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
  SDValue StackSlot =
      DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
  MachineMemOperand *MMO =

  // Store FP control word into memory.
  SDValue Ops[] = {Chain, StackSlot};
  Chain = DAG.getMemIntrinsicNode(
      X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot and clear RM field (bits 11:10).
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
  Chain = CWD.getValue(1);
  CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
                    DAG.getConstant(0xf3ff, DL, MVT::i16));

  // Calculate new rounding mode.
  SDValue NewRM = Op.getNode()->getOperand(1);
  SDValue RMBits;
  if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
    // Constant rounding mode: translate it directly to the x87 RM field
    // encoding.
    uint64_t RM = CVal->getZExtValue();
    int FieldVal;
    switch (static_cast<RoundingMode>(RM)) {
    // clang-format off
    case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
    case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
    case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
    case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
    default:
      llvm_unreachable("rounding mode is not supported by X86 hardware");
    // clang-format on
    }
    RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
  } else {
    // Need to convert argument into bits of control word:
    //   0 Round to 0       -> 11
    //   1 Round to nearest -> 00
    //   2 Round to +inf    -> 10
    //   3 Round to -inf    -> 01
    // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
    // To make the conversion, put all these values into a value 0xc9 and shift
    // it left depending on the rounding mode:
    //   (0xc9 << 4) & 0xc00 = X86::rmTowardZero
    //   (0xc9 << 6) & 0xc00 = X86::rmToNearest
    //   ...
    //   (0xc9 << (2 * NewRM + 4)) & 0xc00
    SDValue ShiftValue =
        DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                    DAG.getNode(ISD::ADD, DL, MVT::i32,
                                DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
                                            DAG.getConstant(1, DL, MVT::i8)),
                                DAG.getConstant(4, DL, MVT::i32)));
    SDValue Shifted =
        DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
                    ShiftValue);
    RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
                         DAG.getConstant(0xc00, DL, MVT::i16));
  }

  // Update rounding mode bits and store the new FP Control Word into stack.
  CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
  Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));

  // Load FP control word from the slot.
  SDValue OpsLD[] = {Chain, StackSlot};
  MachineMemOperand *MMOL =
  Chain = DAG.getMemIntrinsicNode(
      X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);

  // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
  // same way but in bits 14:13.
  if (Subtarget.hasSSE1()) {
    // Store MXCSR into memory.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
        StackSlot);

    // Load MXCSR from stack slot and clear RM field (bits 14:13).
    SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
    Chain = CWD.getValue(1);
    CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
                      DAG.getConstant(0xffff9fff, DL, MVT::i32));

    // Shift X87 RM bits from 11:10 to 14:13.
    RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
    RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
                         DAG.getConstant(3, DL, MVT::i8));

    // Update rounding mode bits and store the new FP Control Word into stack.
    CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
    Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));

    // Load MXCSR from the slot.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
        StackSlot);
  }

  return Chain;
}
28765
// Layout of the FP environment blob used by the FPENV lowerings below: the
// first X87StateSize bytes hold the x87 environment (as saved by FNSTENV);
// MXCSR is stored immediately after it, rounding the whole state up to
// FPStateSize bytes.
const unsigned X87StateSize = 28;
const unsigned FPStateSize = 32;
[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28769
SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Save the FP environment to memory at Ptr: x87 state via FNSTENV (when
  // x87 is available) followed by MXCSR via STMXCSR (when SSE is available).
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue Ptr = Op->getOperand(1);
  auto *Node = cast<FPStateAccessSDNode>(Op);
  EVT MemVT = Node->getMemoryVT();
  MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();

  // Get x87 state, if present.
  if (Subtarget.hasX87()) {
    Chain =
        DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
                                {Chain, Ptr}, MemVT, MMO);

    // FNSTENV changes the exception mask, so load back the stored environment.
    MachineMemOperand::Flags NewFlags =
        (MMO->getFlags() & ~MachineMemOperand::MOStore);
    MMO = MF.getMachineMemOperand(MMO, NewFlags);
    Chain =
        DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
                                {Chain, Ptr}, MemVT, MMO);
  }

  // If target supports SSE, get MXCSR as well.
  if (Subtarget.hasSSE1()) {
    // Get pointer to the MXCSR location in memory.
    SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
                                    DAG.getConstant(X87StateSize, DL, PtrVT));
    // Store MXCSR into memory.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
        MXCSRAddr);
  }

  return Chain;
}
28812
                                   EVT MemVT, MachineMemOperand *MMO,
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  // Restore the x87 environment, if present.
  if (Subtarget.hasX87())
    Chain =
        DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
                                {Chain, Ptr}, MemVT, MMO);
  // If target supports SSE, set MXCSR as well.
  if (Subtarget.hasSSE1()) {
    // Get pointer to the MXCSR location in memory (it follows the x87 state).
    SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
                                    DAG.getConstant(X87StateSize, DL, PtrVT));
    // Load MXCSR from memory.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
        MXCSRAddr);
  }
  return Chain;
}
28836
SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Install the FP environment stored in memory at Ptr via
  // createSetFPEnvNodes (x87 FLDENV plus LDMXCSR when SSE is available).
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue Ptr = Op->getOperand(1);
  auto *Node = cast<FPStateAccessSDNode>(Op);
  EVT MemVT = Node->getMemoryVT();
  MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
  return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
}
28848
SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getNode()->getOperand(0);

  // Build the default FP environment as a constant-pool blob of eight i32s:
  // the x87 control word, six zero words, then the default MXCSR value.
  IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
  ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);

  // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
  // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
  // for compatibility with glibc.
  unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
  FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
  Constant *Zero = ConstantInt::get(ItemTy, 0);
  for (unsigned I = 0; I < 6; ++I)
    FPEnvVals.push_back(Zero);

  // MXCSR: mask all floating-point exceptions, sets rounding to nearest, clear
  // all exceptions, sets DAZ and FTZ to 0.
  FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
  Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
  SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
  MachinePointerInfo MPI =

  return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
}
28881
28882// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28883uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28884 assert((Amt < 8) && "Shift/Rotation amount out of range");
28885 switch (Opcode) {
28886 case ISD::BITREVERSE:
28887 return 0x8040201008040201ULL;
28888 case ISD::SHL:
28889 return ((0x0102040810204080ULL >> (Amt)) &
28890 (0x0101010101010101ULL * (0xFF >> (Amt))));
28891 case ISD::SRL:
28892 return ((0x0102040810204080ULL << (Amt)) &
28893 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28894 case ISD::SRA:
28895 return (getGFNICtrlImm(ISD::SRL, Amt) |
28896 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28897 case ISD::ROTL:
28898 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28899 case ISD::ROTR:
28900 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28901 }
28902 llvm_unreachable("Unsupported GFNI opcode");
28903}
28904
28905// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28906SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28907 MVT VT, unsigned Amt = 0) {
28908 assert(VT.getVectorElementType() == MVT::i8 &&
28909 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28910 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28911 SmallVector<SDValue> MaskBits;
28912 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28913 uint64_t Bits = (Imm >> (I % 64)) & 255;
28914 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28915 }
28916 return DAG.getBuildVector(VT, DL, MaskBits);
28917}
28918
/// Lower a vector CTLZ using the native vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, split
// the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
                                          const X86Subtarget &Subtarget) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
          "Unsupported element type");

  // Split vector, its Lo and Hi parts will be handled in next iteration.
  if (NumElems > 16 ||
      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
    return splitVectorIntUnary(Op, DAG, dl);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use native supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  // The i32 lzcnt counted (32 - EltBits) extra leading zeros introduced by
  // the zero-extension; subtract them off.
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
28953
// Lower CTLZ using a PSHUFB lookup table implementation.
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

  // Replicate the 16-entry LUT across every 16-byte lane.
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = DAG.getConstant(0, DL, CurrVT);

  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = Op0;
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ;
  if (CurrVT.is512BitVector()) {
    // 512-bit: compare into a k-mask and sign-extend back to a vector mask.
    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
  } else {
    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
  }

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge result back from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    if (CurrVT.is512BitVector()) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
    } else {
      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    }
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return Res;
}
29036
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Prefer the native AVX512-CDI lowering when the subtarget supports it.
  if (Subtarget.hasCDI() &&
      // vXi8 vectors need to be promoted to 512-bits for vXi32.
      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntUnary(Op, DAG, DL);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG, DL);

  // Fall back to the PSHUFB lookup-table implementation.
  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
29058
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue Input = Op.getOperand(0);

  assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
         "Expected vXi8 input for GFNI-based CTLZ lowering");

  // CTLZ(x) == CTTZ(bitreverse(x)): reverse the bits, isolate the lowest set
  // bit, then map that power of two to its index with a gf2p8affine matrix.
  SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);

  // Isolate the lowest set bit: x & -x.
  SDValue Neg = DAG.getNegative(Reversed, DL, VT);
  SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);

  MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
  SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);

  // The immediate 8 is added to the matrix product, yielding 8 for a zero
  // input byte (no bits set).
  SDValue LZCNT =
      DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
                  DAG.getTargetConstant(8, DL, MVT::i8));
  return LZCNT;
}
29082
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  // Lower scalar CTLZ/CTLZ_ZERO_UNDEF via BSR, and dispatch vector forms to
  // the dedicated vector lowerings.
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  // vXi8 CTLZ can be done entirely with GFNI when available.
  if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
    return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Check if we can safely pass a result though BSR for zero sources.
  // 2*NumBits-1 is chosen so the final XOR below turns it into NumBits.
  SDValue PassThru = DAG.getUNDEF(OpVT);
  if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
      !DAG.isKnownNeverZero(Op))
    PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);

  // Skip CMOV if we're using a pass through value.
  if (Opc == ISD::CTLZ && PassThru.isUndef()) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
                     DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
                     Op.getValue(1)};
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1. BSR produces the index of the highest set
  // bit; XOR with NumBits-1 converts that index into a leading-zero count
  // (and maps the 2*NumBits-1 zero-source value to NumBits).
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}
29131
29132static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29133 SelectionDAG &DAG) {
29134 MVT VT = Op.getSimpleValueType();
29135 unsigned NumBits = VT.getScalarSizeInBits();
29136 SDValue N0 = Op.getOperand(0);
29137 SDLoc dl(Op);
29138 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29139
29140 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29141 "Only scalar CTTZ requires custom lowering");
29142
29143 // Check if we can safely pass a result though BSF for zero sources.
29144 SDValue PassThru = DAG.getUNDEF(VT);
29145 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29146 PassThru = DAG.getConstant(NumBits, dl, VT);
29147
29148 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29149 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29150 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29151
29152 // Skip CMOV if src is never zero or we're using a pass through value.
29153 if (NonZeroSrc || !PassThru.isUndef())
29154 return Op;
29155
29156 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29157 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29158 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29159 Op.getValue(1)};
29160 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29161}
29162
                           const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Scalar i16/i32: try lowering as a horizontal vector op.
  if (VT == MVT::i16 || VT == MVT::i32)
    return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);

  // v32i16/v64i8: split into 256-bit halves.
  if (VT == MVT::v32i16 || VT == MVT::v64i8)
    return splitVectorIntBinary(Op, DAG, DL);

  // Remaining 256-bit integer vectors: split into 128-bit halves.
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return splitVectorIntBinary(Op, DAG, DL);
}
29179
                                  const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  SDLoc DL(Op);

  // Types too wide for the subtarget: split and recurse.
  if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
      (VT.is256BitVector() && !Subtarget.hasInt256())) {
    assert(Op.getSimpleValueType().isInteger() &&
           "Only handle AVX vector integer operation");
    return splitVectorIntBinary(Op, DAG, DL);
  }

  // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  unsigned BitWidth = VT.getScalarSizeInBits();
  if (Opcode == ISD::USUBSAT) {
    if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
      // Handle a special-case with a bit-hack instead of cmp+select:
      // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
      // If the target can use VPTERNLOG, DAGToDAG will match this as
      // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
      // "broadcast" constant load.
      if (C && C->getAPIntValue().isSignMask()) {
        SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
        SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
        SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
        SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
        return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
      }
    }
    if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
      // usubsat X, Y --> (X >u Y) ? X - Y : 0
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
      SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
      // TODO: Move this to DAGCombiner?
      // If the setcc result is all-ones/all-zeros of the value type, an AND
      // suffices instead of a select.
      if (SetCCResultType == VT &&
          DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
        return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
      return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
    }
  }

  if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
      (!VT.isVector() || VT == MVT::v2i64)) {
    // Signed saturation via the overflow op: on overflow, select the
    // saturation constant matching the sign of the exact (wrapped) result.
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue Result =
        DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
                    DAG.getVTList(VT, SetCCResultType), X, Y);
    SDValue SumDiff = Result.getValue(0);
    SDValue Overflow = Result.getValue(1);
    SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
    SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
    SDValue SumNeg =
        DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
    Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
    return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
  }

  // Use default expansion.
  return SDValue();
}
29249
29250static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29251 SelectionDAG &DAG) {
29252 MVT VT = Op.getSimpleValueType();
29253 SDLoc DL(Op);
29254
29255 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29256 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29257 // 8-bit integer abs to NEG and CMOV.
29258 SDValue N0 = Op.getOperand(0);
29259 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29260 DAG.getConstant(0, DL, VT), N0);
29261 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29262 SDValue(Neg.getNode(), 1)};
29263 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29264 }
29265
29266 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29267 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29268 SDValue Src = Op.getOperand(0);
29269 SDValue Neg = DAG.getNegative(Src, DL, VT);
29270 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29271 }
29272
29273 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29274 assert(VT.isInteger() &&
29275 "Only handle AVX 256-bit vector integer operation");
29276 return splitVectorIntUnary(Op, DAG, DL);
29277 }
29278
29279 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29280 return splitVectorIntUnary(Op, DAG, DL);
29281
29282 // Default to expand.
29283 return SDValue();
29284}
29285
29286static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29287 SelectionDAG &DAG) {
29288 MVT VT = Op.getSimpleValueType();
29289 SDLoc DL(Op);
29290
29291 // For AVX1 cases, split to use legal ops.
29292 if (VT.is256BitVector() && !Subtarget.hasInt256())
29293 return splitVectorIntBinary(Op, DAG, DL);
29294
29295 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29296 return splitVectorIntBinary(Op, DAG, DL);
29297
29298 // Default to expand.
29299 return SDValue();
29300}
29301
29302static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29303 SelectionDAG &DAG) {
29304 MVT VT = Op.getSimpleValueType();
29305 SDLoc DL(Op);
29306
29307 // For AVX1 cases, split to use legal ops.
29308 if (VT.is256BitVector() && !Subtarget.hasInt256())
29309 return splitVectorIntBinary(Op, DAG, DL);
29310
29311 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29312 return splitVectorIntBinary(Op, DAG, DL);
29313
29314 // Default to expand.
29315 return SDValue();
29316}
29317
                                       SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = Op.getValueType();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);
  SDLoc DL(Op);
  bool IsMaxOp =
      Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
  bool IsNum =
      Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
  // AVX10.2 has a native min/max instruction taking an immediate control:
  // bit 0 selects max vs min, bit 4 (value 16) selects the "num"
  // (IEEE-754-2019 minimumNumber/maximumNumber) behavior.
  if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
    unsigned Opc = 0;
    if (VT.isVector())
    else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)

    if (Opc) {
      SDValue Imm =
          DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
      return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
    }
  }

  // PreferredZero is the zero we may return without reordering the operands
  // (+0 for max, -0 for min); OppositeZero is the other signed zero.
  uint64_t SizeInBits = VT.getScalarSizeInBits();
  APInt PreferredZero = APInt::getZero(SizeInBits);
  APInt OppositeZero = PreferredZero;
  EVT IVT = VT.changeTypeToInteger();
  X86ISD::NodeType MinMaxOp;
  if (IsMaxOp) {
    MinMaxOp = X86ISD::FMAX;
    OppositeZero.setSignBit();
  } else {
    PreferredZero.setSignBit();
    MinMaxOp = X86ISD::FMIN;
  }
  EVT SetCCType =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // The tables below show the expected result of Max in cases of NaN and
  // signed zeros.
  //
  //                 Y                       Y
  //             Num   xNaN              +0     -0
  //          ---------------         ---------------
  //     Num  |  Max |   Y  |     +0  |  +0  |  +0  |
  //  X       ---------------  X      ---------------
  //    xNaN  |   X  |  X/Y |     -0  |  +0  |  -0  |
  //          ---------------         ---------------
  //
  // It is achieved by means of FMAX/FMIN with preliminary checks and operand
  // reordering.
  //
  // We check if any of operands is NaN and return NaN. Then we check if any of
  // operands is zero or negative zero (for fmaximum and fminimum respectively)
  // to ensure the correct zero is returned.
  //
  // MatchesZero: true iff Op is a constant (or constant vector) whose zero
  // elements all have exactly the bit pattern `Zero`. Note that non-zero FP
  // elements and undefs in a build/splat vector are ignored, not rejected.
  auto MatchesZero = [](SDValue Op, APInt Zero) {
    if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
      return CstOp->getValueAPF().bitcastToAPInt() == Zero;
    if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
      return CstOp->getAPIntValue() == Zero;
    if (Op->getOpcode() == ISD::BUILD_VECTOR ||
        Op->getOpcode() == ISD::SPLAT_VECTOR) {
      for (const SDValue &OpVal : Op->op_values()) {
        if (OpVal.isUndef())
          continue;
        auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
        if (!CstOp)
          return false;
        if (!CstOp->getValueAPF().isZero())
          continue;
        if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
          return false;
      }
      return true;
    }
    return false;
  };

  bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
  bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
  bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
                          Op->getFlags().hasNoSignedZeros() ||
                          DAG.isKnownNeverZeroFloat(X) ||
  SDValue NewX, NewY;
  if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
      MatchesZero(X, OppositeZero)) {
    // Operands are already in right order or order does not matter.
    NewX = X;
    NewY = Y;
  } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
    NewX = Y;
    NewY = X;
  } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
             (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
    // Scalar case with a known-not-NaN operand: classify X at run time
    // (NaN / wrong-signed zero) and swap the operands when needed.
    if (IsXNeverNaN)
      std::swap(X, Y);
    // VFPCLASSS consumes a vector type. So provide a minimal one corresponded
    // xmm register.
    MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
    // Bits of classes:
    // Bits  Imm8[0]  Imm8[1]  Imm8[2]  Imm8[3]  Imm8[4]  Imm8[5]  Imm8[6]  Imm8[7]
    // Class    QNAN  PosZero  NegZero   PosINF   NegINF Denormal Negative     SNAN
    SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
                                        DL, MVT::i32);
    SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
    SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                              DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
                              DAG.getVectorIdxConstant(0, DL));
    SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
    NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
    NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
    return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
  } else {
    // General case: order the operands at run time by the sign bit of X so
    // that FMAX/FMIN (which return the second operand on equal/NaN inputs)
    // produce the correctly-signed zero.
    SDValue IsXSigned;
    if (Subtarget.is64Bit() || VT != MVT::f64) {
      SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
      SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
      IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
    } else {
      // 32-bit target, f64 operand: no i64 registers, so inspect the high
      // 32 bits of the value via a v2f64 -> v4f32 bitcast instead.
      assert(VT == MVT::f64);
      SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
                                DAG.getConstantFP(0, DL, MVT::v2f64), X,
                                DAG.getVectorIdxConstant(0, DL));
      SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
      SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
                               DAG.getVectorIdxConstant(1, DL));
      Hi = DAG.getBitcast(MVT::i32, Hi);
      SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
      EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                             *DAG.getContext(), MVT::i32);
      IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
    }
    if (MinMaxOp == X86ISD::FMAX) {
      NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
      NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
    } else {
      NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
      NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
    }
  }

  bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
                   Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);

  // If we did no ordering operands for signed zero handling and we need
  // to process NaN and we know that one of the operands is not NaN then:
  //   - For minimum/maximum, put it in the first operand,
  //   - For minimumnum/maximumnum, put it in the second operand,
  // and we will not need to post handle NaN after max/min.
  if (IgnoreSignedZero && !IgnoreNaN &&
      DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
    std::swap(NewX, NewY);

  SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());

  if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
    return MinMax;

  if (DAG.isKnownNeverNaN(NewX))
    NewX = NewY;

  // Post-process NaN: select the NaN operand (minimum/maximum) or the
  // min/max result (minimumnum/maximumnum) depending on the ordered check.
  SDValue IsNaN =
      DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);

  return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
}
29489
29490static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29491 SelectionDAG &DAG) {
29492 MVT VT = Op.getSimpleValueType();
29493 SDLoc dl(Op);
29494
29495 // For AVX1 cases, split to use legal ops.
29496 if (VT.is256BitVector() && !Subtarget.hasInt256())
29497 return splitVectorIntBinary(Op, DAG, dl);
29498
29499 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29500 return splitVectorIntBinary(Op, DAG, dl);
29501
29502 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29504
29505 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29506 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29507 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29508
29509 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29510 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29511 if (VT.bitsGE(MVT::i32)) {
29512 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29513 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29514 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29515 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29516 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29517 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29518 DAG.getTargetConstant(CC, dl, MVT::i8),
29519 Diff1.getValue(1));
29520 }
29521
29522 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29523 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29524 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29525 MVT WideVT = MVT::getIntegerVT(WideBits);
29526 if (TLI.isTypeLegal(WideVT)) {
29527 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29528 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29529 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29530 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29531 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29532 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29533 DAG.getTargetConstant(CC, dl, MVT::i8),
29534 Diff1.getValue(1));
29535 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29536 }
29537 }
29538
29539 // Default to expand.
29540 return SDValue();
29541}
29542
/// Custom-lower ISD::MUL for vector types with no direct X86 instruction:
/// vXi8 (via pmaddubsw or unpack-to-i16 + pmullw + pack), v4i32 pre-SSE41
/// (via two pmuludq), and vXi64 (via up to three pmuludq partial products).
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntBinary(Op, DAG, dl);

  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return splitVectorIntBinary(Op, DAG, dl);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned NumLanes = VT.getSizeInBits() / 128;
    unsigned NumEltsPerLane = NumElts / NumLanes;

    // If the whole vector can be extended to double width in one step,
    // do the multiply at i16 and truncate back.
    if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
        (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
    }

    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

    // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
    // Don't do this if we only need to unpack one half.
    if (Subtarget.hasSSSE3()) {
      bool BIsBuildVector = isa<BuildVectorSDNode>(B);
      bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
      bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
      if (BIsBuildVector) {
        // Check per 128-bit lane whether the low/high half of B is entirely
        // zero-or-undef; if so the unpack path below is cheaper.
        for (auto [Idx, Val] : enumerate(B->ops())) {
          if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
            IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
          else
            IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
        }
      }
      if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
        // Zero alternate bytes of B so each pmaddubsw pair-sum contains a
        // single non-zero product, i.e. a per-byte 16-bit multiply; then
        // recombine the even-byte (RLo) and odd-byte (RHi) products.
        SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
        SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
        SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
        SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
        SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
        RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
        RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
                          DAG.getTargetConstant(8, dl, MVT::i8));
        return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
      }
    }

    // Extract the lo/hi parts to any extend to i16.
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element.
    SDValue Undef = DAG.getUNDEF(VT);
    SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
    SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));

    SDValue BLo, BHi;
    if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
      // If the RHS is a constant, manually unpackl/unpackh.
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (unsigned i = 0; i != NumElts; i += 16) {
        for (unsigned j = 0; j != 8; ++j) {
          LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
                                               MVT::i16));
          HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
                                               MVT::i16));
        }
      }

      BLo = DAG.getBuildVector(ExVT, dl, LoOps);
      BHi = DAG.getBuildVector(ExVT, dl, HiOps);
    } else {
      BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
      BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
    }

    // Multiply, mask the lower 8bits of the lo/hi results and pack.
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmulld is available!");

    // Extract the odd parts.
    static const int UnpackMask[] = {1, 1, 3, 3};
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                                DAG.getBitcast(MVT::v2i64, A),
                                DAG.getBitcast(MVT::v2i64, B));
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                               DAG.getBitcast(MVT::v2i64, Aodds),
                               DAG.getBitcast(MVT::v2i64, Bodds));

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");
  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");

  // 64-bit multiply from three 32x32->64 partial products:
  //
  // Ahi = psrlqi(a, 32);
  // Bhi = psrlqi(b, 32);
  //
  // AloBlo = pmuludq(a, b);
  // AloBhi = pmuludq(a, Bhi);
  // AhiBlo = pmuludq(Ahi, b);
  //
  // Hi = psllqi(AloBhi + AhiBlo, 32);
  // return AloBlo + Hi;
  KnownBits AKnown = DAG.computeKnownBits(A);
  KnownBits BKnown = DAG.computeKnownBits(B);

  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
  bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
  bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);

  SDValue Zero = DAG.getConstant(0, dl, VT);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
29714
                                     MVT VT, bool IsSigned,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG,
                                     SDValue *Low = nullptr) {
  unsigned NumElts = VT.getVectorNumElements();

  // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
  // to a vXi16 type. Do the multiplies, shift the results and pack the half
  // lane results back together.

  // We'll take different approaches for signed and unsigned.
  // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
  // and use pmullw to calculate the full 16-bit product.
  // For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
  // shift them left into the upper byte of each word. This allows us to use
  // pmulhw to calculate the full 16-bit product. This trick means we don't
  // need to sign extend the bytes to use pmullw.

  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
  SDValue Zero = DAG.getConstant(0, dl, VT);

  // Unpacking with Zero as the *first* operand places each byte of A in the
  // high byte of its word (the signed path); Zero second zero-extends (the
  // unsigned path).
  SDValue ALo, AHi;
  if (IsSigned) {
    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
  } else {
    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
  }

  SDValue BLo, BHi;
  if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
    // If the RHS is a constant, manually unpackl/unpackh and extend.
    SmallVector<SDValue, 16> LoOps, HiOps;
    for (unsigned i = 0; i != NumElts; i += 16) {
      for (unsigned j = 0; j != 8; ++j) {
        SDValue LoOp = B.getOperand(i + j);
        SDValue HiOp = B.getOperand(i + j + 8);

        if (IsSigned) {
          // Mirror the unpack-with-zero layout: value in the high byte.
          LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
          HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
          LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
                             DAG.getConstant(8, dl, MVT::i16));
          HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
                             DAG.getConstant(8, dl, MVT::i16));
        } else {
          LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
          HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
        }

        LoOps.push_back(LoOp);
        HiOps.push_back(HiOp);
      }
    }

    BLo = DAG.getBuildVector(ExVT, dl, LoOps);
    BHi = DAG.getBuildVector(ExVT, dl, HiOps);
  } else if (IsSigned) {
    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
  } else {
    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
  }

  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
  // pack back to vXi8.
  // Signed uses MULHS of the byte-in-high-half words ((a<<8)*(b<<8) >> 16
  // equals the full 16-bit product a*b); unsigned uses a plain MUL.
  unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
  SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);

  // Optionally also return the low byte of each product via *Low.
  if (Low)
    *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);

  return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
}
29793
/// Custom-lower ISD::MULHS/MULHU (high half of a widening multiply) for
/// vXi32 (via PMULDQ/PMULUDQ on even/odd element pairs) and vXi8 (via
/// widening to i16).
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  bool IsSigned = Op->getOpcode() == ISD::MULHS;
  unsigned NumElts = VT.getVectorNumElements();
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Decompose 256-bit ops into 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntBinary(Op, DAG, dl);

  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return splitVectorIntBinary(Op, DAG, dl);

  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
           (VT == MVT::v16i32 && Subtarget.hasAVX512()));

    // PMULxD operations multiply each even value (starting at 0) of LHS with
    // the related value of RHS and produce a widen result.
    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
    // => <2 x i64> <ae|cg>
    //
    // In other word, to have all the results, we need to perform two PMULxD:
    // 1. one with the even values.
    // 2. one with the odd values.
    // To achieve #2, with need to place the odd values at an even position.
    //
    // Place the odd value at an even position (basically, shift all values 1
    // step to the left):
    const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
                        9, -1, 11, -1, 13, -1, 15, -1};
    // <a|b|c|d> => <b|undef|d|undef>
    SDValue Odd0 =
        DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
    // <e|f|g|h> => <f|undef|h|undef>
    SDValue Odd1 =
        DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));

    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
    // ints.
    MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
    unsigned Opcode =
        (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
    // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
    // => <2 x i64> <ae|cg>
    SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                  DAG.getBitcast(MulVT, A),
                                                  DAG.getBitcast(MulVT, B)));
    // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
    // => <2 x i64> <bf|dh>
    SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                  DAG.getBitcast(MulVT, Odd0),
                                                  DAG.getBitcast(MulVT, Odd1)));

    // Shuffle it back into the right order.
    // Pick the odd (high) 32-bit half of each 64-bit product, interleaving
    // the even-element products (Mul1) with the odd-element ones (Mul2).
    SmallVector<int, 16> ShufMask(NumElts);
    for (int i = 0; i != (int)NumElts; ++i)
      ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;

    SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);

    // If we have a signed multiply but no PMULDQ fix up the result of an
    // unsigned multiply.
    // mulhs(a,b) = mulhu(a,b) - (a<0 ? b : 0) - (b<0 ? a : 0).
    if (IsSigned && !Subtarget.hasSSE41()) {
      SDValue Zero = DAG.getConstant(0, dl, VT);
      SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                               DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
      SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                               DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);

      SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
    }

    return Res;
  }

  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
         "Unsupported vector type");

  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.

  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.

  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
    unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
    Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
  }

  return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
}
29899
29900// Custom lowering for SMULO/UMULO.
/// Custom-lower vector ISD::SMULO/ISD::UMULO: compute the full i16 product
/// of each i8 pair, return the low byte plus an overflow mask derived from
/// the high byte. Scalars are forwarded to LowerXALUO.
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Scalars defer to LowerXALUO.
  if (!VT.isVector())
    return LowerXALUO(Op, DAG);

  SDLoc dl(Op);
  bool IsSigned = Op->getOpcode() == ISD::SMULO;
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);
  EVT OvfVT = Op->getValueType(1);

  // Vector widths the target cannot handle: split operands AND the overflow
  // result type, lower each half, then concatenate both results.
  if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
      (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
    // Extract the LHS Lo/Hi vectors
    SDValue LHSLo, LHSHi;
    std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);

    // Extract the RHS Lo/Hi vectors
    SDValue RHSLo, RHSHi;
    std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);

    EVT LoOvfVT, HiOvfVT;
    std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
    SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
    SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);

    // Issue the split operations.
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);

    // Join the separate data results and the overflow results.
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
    SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
                              Hi.getValue(1));

    return DAG.getMergeValues({Res, Ovf}, dl);
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetccVT =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // Fast path: extend the whole vector to i16 in one step and multiply.
  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
    unsigned NumElts = VT.getVectorNumElements();
    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
    unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);

    SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);

    SDValue Ovf;
    if (IsSigned) {
      // SMULO overflows when the high byte differs from the sign-extension
      // of the low byte.
      SDValue High, LowSign;
      if (OvfVT.getVectorElementType() == MVT::i1 &&
          (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather the truncating try to do the compare on vXi16 or vXi32.
        // Shift the high down filling with sign bits.
        High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
        // Fill all 16 bits with the sign bit from the low.
        LowSign =
            getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
        LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
                                             15, DAG);
        SetccVT = OvfVT;
        if (!Subtarget.hasBWI()) {
          // We can't do a vXi16 compare so sign extend to v16i32.
          High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
          LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
        }
      } else {
        // Otherwise do the compare at vXi8.
        High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
        High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
        LowSign =
            DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
      }

      Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
    } else {
      // UMULO overflows when the high byte of the product is non-zero.
      SDValue High =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
      if (OvfVT.getVectorElementType() == MVT::i1 &&
          (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather the truncating try to do the compare on vXi16 or vXi32.
        SetccVT = OvfVT;
        if (!Subtarget.hasBWI()) {
          // We can't do a vXi16 compare so sign extend to v16i32.
          High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
        }
      } else {
        // Otherwise do the compare at vXi8.
        High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
      }

      Ovf =
          DAG.getSetCC(dl, SetccVT, High,
                       DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
    }

    Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);

    return DAG.getMergeValues({Low, Ovf}, dl);
  }

  // Fallback: unpack-based i8 multiply returning both halves of the product.
  SDValue Low;
  SDValue High =
      LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);

  SDValue Ovf;
  if (IsSigned) {
    // SMULO overflows if the high bits don't match the sign of the low.
    SDValue LowSign =
        DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
    Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
  } else {
    // UMULO overflows if the high bits are non-zero.
    Ovf =
        DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
  }

  Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);

  return DAG.getMergeValues({Low, Ovf}, dl);
}
30031
// Lower a 128-bit integer SDIV/UDIV/SREM/UREM on Win64 via a libcall
// (__divti3 family). The i128 operands are spilled to 16-byte-aligned stack
// slots and passed indirectly; the i128 result comes back in XMM0 as v2i64
// and is bitcast back to the expected type.
// NOTE(review): several declaration lines ('Result', 'Args', 'Callee', 'CLI',
// the SmallVector for stack args) are elided from this extract — confirm
// against the full source.
30032SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30033  assert(Subtarget.isTargetWin64() && "Unexpected target");
30034  EVT VT = Op.getValueType();
30035  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30036         "Unexpected return type for lowering");
30037
  // Division/remainder by a constant can often be expanded inline as two
  // i64 halves, avoiding the libcall entirely.
30038  if (isa<ConstantSDNode>(Op->getOperand(1))) {
30040    if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30041      return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30042  }
30043
  // Select the runtime-library entry point and its signedness.
30044  RTLIB::Libcall LC;
30045  bool isSigned;
30046  switch (Op->getOpcode()) {
30047  // clang-format off
30048  default: llvm_unreachable("Unexpected request for libcall!");
30049  case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30050  case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30051  case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30052  case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30053  // clang-format on
30054  }
30055
30056  SDLoc dl(Op);
30057  SDValue InChain = DAG.getEntryNode();
30058
  // Store each i128 operand to a 16-byte-aligned stack temporary and pass a
  // pointer to it (the Win64 ABI passes >64-bit integers indirectly).
30060  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30061    EVT ArgVT = Op->getOperand(i).getValueType();
30062    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30063           "Unexpected argument type for lowering");
30064    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30065    int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30066    MachinePointerInfo MPI =
30068    InChain =
30069        DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30070    Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30071  }
30072
30075
  // Build the call: the callee returns the i128 in a vector register
  // (v2i64 in XMM0), hence the v2i64 return type below.
30077  CLI.setDebugLoc(dl)
30078      .setChain(InChain)
30079      .setLibCallee(
30081          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30082          std::move(Args))
30083      .setInRegister()
30084      .setSExtResult(isSigned)
30085      .setZExtResult(!isSigned);
30086
30087  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  // Reinterpret the v2i64 call result as the original i128 value type.
30088  return DAG.getBitcast(VT, CallInfo.first);
30089}
30090
// Lower FP -> 128-bit integer conversions on Win64 via a libcall.
// \param Op     The (possibly strict) FP_TO_SINT/FP_TO_UINT node.
// \param Chain  [out] Updated chain after the call.
// \returns The i128 result, bitcast from the v2i64 the callee returns in xmm0.
// NOTE(review): the 'SDValue Result;' declaration line is elided from this
// extract — confirm against the full source.
30091SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30092                                                   SelectionDAG &DAG,
30093                                                   SDValue &Chain) const {
30094  assert(Subtarget.isTargetWin64() && "Unexpected target");
30095  EVT VT = Op.getValueType();
30096  bool IsStrict = Op->isStrictFPOpcode();
30097
  // Strict nodes carry the chain in operand 0 and the FP value in operand 1.
30098  SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30099  EVT ArgVT = Arg.getValueType();
30100
30101  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30102         "Unexpected return type for lowering");
30103
  // Pick the __fixXfti / __fixunsXfti style entry point for this FP type.
30104  RTLIB::Libcall LC;
30105  if (Op->getOpcode() == ISD::FP_TO_SINT ||
30106      Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30107    LC = RTLIB::getFPTOSINT(ArgVT, VT);
30108  else
30109    LC = RTLIB::getFPTOUINT(ArgVT, VT);
30110  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30111
30112  SDLoc dl(Op);
30113  MakeLibCallOptions CallOptions;
30114  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30115
30117  // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30118  // expected VT (i128).
30119  std::tie(Result, Chain) =
30120      makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30121  Result = DAG.getBitcast(VT, Result);
30122  return Result;
30123}
30124
// Lower 128-bit integer -> FP conversions on Win64 via a libcall.
// The i128 argument is spilled to a 16-byte-aligned stack slot and passed
// indirectly, matching the Win64 ABI for >64-bit integer arguments.
// For strict nodes the result is merged with the updated chain.
// NOTE(review): the 'SDValue Result;' declaration and part of the
// MachinePointerInfo initializer are elided from this extract — confirm
// against the full source.
30125SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30126                                                   SelectionDAG &DAG) const {
30127  assert(Subtarget.isTargetWin64() && "Unexpected target");
30128  EVT VT = Op.getValueType();
30129  bool IsStrict = Op->isStrictFPOpcode();
30130
  // Strict nodes carry the chain in operand 0 and the integer in operand 1.
30131  SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30132  EVT ArgVT = Arg.getValueType();
30133
30134  assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30135         "Unexpected argument type for lowering");
30136
  // Pick the __floattiXf / __floatuntiXf style entry point.
30137  RTLIB::Libcall LC;
30138  if (Op->getOpcode() == ISD::SINT_TO_FP ||
30139      Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30140    LC = RTLIB::getSINTTOFP(ArgVT, VT);
30141  else
30142    LC = RTLIB::getUINTTOFP(ArgVT, VT);
30143  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30144
30145  SDLoc dl(Op);
30146  MakeLibCallOptions CallOptions;
30147  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30148
30149  // Pass the i128 argument as an indirect argument on the stack.
30150  SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30151  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30152  MachinePointerInfo MPI =
30154  Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30155
30157  std::tie(Result, Chain) =
30158      makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30159  return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30160}
30161
30162// Return true if the required (according to Opcode) shift-imm form is natively
30163// supported by the Subtarget
30164static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30165 unsigned Opcode) {
30166 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30167 "Unexpected shift opcode");
30168
30169 if (!VT.isSimple())
30170 return false;
30171
30172 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30173 return false;
30174
30175 if (VT.getScalarSizeInBits() < 16)
30176 return false;
30177
30178 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30179 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30180 return true;
30181
30182 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30183 (VT.is256BitVector() && Subtarget.hasInt256());
30184
30185 bool AShift = LShift && (Subtarget.hasAVX512() ||
30186 (VT != MVT::v2i64 && VT != MVT::v4i64));
30187 return (Opcode == ISD::SRA) ? AShift : LShift;
30188}
30189
30190// The shift amount is a variable, but it is the same for all vector lanes.
30191// These instructions are defined together with shift-immediate.
// A uniform-variable shift is legal exactly when the corresponding
// shift-by-immediate form is, so simply forward the query.
// NOTE(review): the signature line (presumably
// 'supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,')
// is elided from this extract — confirm against the full source.
30192static
30194                                            unsigned Opcode) {
30195  return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30196}
30197
30198// Return true if the required (according to Opcode) variable-shift form is
30199// natively supported by the Subtarget
30200static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30201 unsigned Opcode) {
30202 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30203 "Unexpected shift opcode");
30204
30205 if (!VT.isSimple())
30206 return false;
30207
30208 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30209 return false;
30210
30211 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30212 return false;
30213
30214 // vXi16 supported only on AVX-512, BWI
30215 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30216 return false;
30217
30218 if (Subtarget.hasAVX512() &&
30219 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30220 return true;
30221
30222 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30223 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30224 return (Opcode == ISD::SRA) ? AShift : LShift;
30225}
30226
// Lower a vector shift whose amount is a splatted constant immediate
// (called LowerShiftByScalarImmediate per the call site in LowerShift).
// Returns SDValue() when no profitable constant-amount lowering applies,
// letting generic handling take over.
// NOTE(review): the signature line and two 'SDValue Upper/Lower =' lines
// inside the lambda are elided from this extract — confirm against the full
// source.
30228                                          const X86Subtarget &Subtarget) {
30229  MVT VT = Op.getSimpleValueType();
30230  SDLoc dl(Op);
30231  SDValue R = Op.getOperand(0);
30232  SDValue Amt = Op.getOperand(1);
30233  unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30234  unsigned EltSizeInBits = VT.getScalarSizeInBits();
30235
  // Emulate a v2i64/v4i64 arithmetic right shift (no native form pre-AVX512)
  // by operating on the i32 halves and reassembling with shuffles.
30236  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30237    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30238    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30239    SDValue Ex = DAG.getBitcast(ExVT, R);
30240
30241    // ashr(R, 63) === cmp_slt(R, 0)
30242    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30243      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30244             "Unsupported PCMPGT op");
30245      return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30246    }
30247
30248    if (ShiftAmt >= 32) {
30249      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30250      SDValue Upper =
30251          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30253                                           ShiftAmt - 32, DAG);
30254      if (VT == MVT::v2i64)
30255        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30256      if (VT == MVT::v4i64)
30257        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30258                                  {9, 1, 11, 3, 13, 5, 15, 7});
30259    } else {
30260      // SRA upper i32, SRL whole i64 and select lower i32.
30262                                           ShiftAmt, DAG);
30263      SDValue Lower =
30264          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30265      Lower = DAG.getBitcast(ExVT, Lower);
30266      if (VT == MVT::v2i64)
30267        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30268      if (VT == MVT::v4i64)
30269        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30270                                  {8, 1, 10, 3, 12, 5, 14, 7});
30271    }
30272    return DAG.getBitcast(VT, Ex);
30273  };
30274
30275  // Optimize shl/srl/sra with constant shift amount.
30276  APInt APIntShiftAmt;
30277  if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30278    return SDValue();
30279
30280  // If the shift amount is out of range, return undef.
30281  if (APIntShiftAmt.uge(EltSizeInBits))
30282    return DAG.getUNDEF(VT);
30283
30284  uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30285
30286  if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30287    // Hardware support for vector shifts is sparse which makes us scalarize the
30288    // vector operations in many cases. Also, on sandybridge ADD is faster than
30289    // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30290    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30291      // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30292      // must be 0). (add undef, undef) however can be any value. To make this
30293      // safe, we must freeze R to ensure that register allocation uses the same
30294      // register for an undefined value. This ensures that the result will
30295      // still be even and preserves the original semantics.
30296      R = DAG.getFreeze(R);
30297      return DAG.getNode(ISD::ADD, dl, VT, R, R);
30298    }
30299
30300    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30301  }
30302
30303  // i64 SRA needs to be performed as partial shifts.
30304  if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30305       (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30306      Op.getOpcode() == ISD::SRA)
30307    return ArithmeticShiftRight64(ShiftAmt);
30308
30309  // If we're logical shifting an all-signbits value then we can just perform as
30310  // a mask.
30311  if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30312      DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30313    SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30314    Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30315    return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30316  }
30317
  // vXi8 shifts: no native instruction, so shift as vXi16 and mask off the
  // bits that crossed byte boundaries (or use special cases below).
30318  if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30319      (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30320    unsigned NumElts = VT.getVectorNumElements();
30321    MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30322
30323    // Simple i8 add case
30324    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30325      // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30326      // must be 0). (add undef, undef) however can be any value. To make this
30327      // safe, we must freeze R to ensure that register allocation uses the same
30328      // register for an undefined value. This ensures that the result will
30329      // still be even and preserves the original semantics.
30330      R = DAG.getFreeze(R);
30331      return DAG.getNode(ISD::ADD, dl, VT, R, R);
30332    }
30333
30334    // ashr(R, 7) === cmp_slt(R, 0)
30335    if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30336      SDValue Zeros = DAG.getConstant(0, dl, VT);
30337      if (VT.is512BitVector()) {
30338        assert(VT == MVT::v64i8 && "Unexpected element type!");
30339        SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30340        return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30341      }
30342      return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30343    }
30344
30345    // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30346    if (VT == MVT::v16i8 && Subtarget.hasXOP())
30347      return SDValue();
30348
    // GFNI can implement any per-byte shift as a single affine transform.
30349    if (Subtarget.hasGFNI()) {
30350      SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30351      return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30352                         DAG.getTargetConstant(0, dl, MVT::i8));
30353    }
30354
30355    if (Op.getOpcode() == ISD::SHL) {
30356      // Make a large shift.
30357      SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30358                                               ShiftAmt, DAG);
30359      SHL = DAG.getBitcast(VT, SHL);
30360      // Zero out the rightmost bits.
30361      APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30362      return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30363    }
30364    if (Op.getOpcode() == ISD::SRL) {
30365      // Make a large shift.
30366      SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30367                                               ShiftAmt, DAG);
30368      SRL = DAG.getBitcast(VT, SRL);
30369      // Zero out the leftmost bits.
30370      APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30371      return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30372    }
30373    if (Op.getOpcode() == ISD::SRA) {
30374      // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30375      SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30376
      // 128 >> ShiftAmt is the shifted sign bit; xor/sub replicates it into
      // the high bits, sign-extending the logically-shifted result.
30377      SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30378      Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30379      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30380      return Res;
30381    }
30382    llvm_unreachable("Unknown shift opcode.");
30383  }
30384
30385  return SDValue();
30386}
30387
// Lower a vector shift whose amount is a splatted (uniform but non-constant)
// scalar (called LowerShiftByScalarVariable per the call site in LowerShift).
// Returns SDValue() if no uniform-amount lowering applies.
// NOTE(review): the signature line is elided from this extract — confirm
// against the full source.
30389                                           const X86Subtarget &Subtarget) {
30390  MVT VT = Op.getSimpleValueType();
30391  SDLoc dl(Op);
30392  SDValue R = Op.getOperand(0);
30393  SDValue Amt = Op.getOperand(1);
30394  unsigned Opcode = Op.getOpcode();
30395  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30396
  // Look through Amt for a splatted scalar shift amount.
30397  int BaseShAmtIdx = -1;
30398  if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
    // Directly supported: emit the target shift node.
30399    if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30400      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30401                                 Subtarget, DAG);
30402
30403    // vXi8 shifts - shift as v8i16 + mask result.
30404    if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30405         (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30406         VT == MVT::v64i8) &&
30407        !Subtarget.hasXOP()) {
30408      unsigned NumElts = VT.getVectorNumElements();
30409      MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30410      if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
        // SRA is built from SRL plus a sign-fixup below, so the actual
        // hardware shift is always the logical form.
30411        unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30412        unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30413
30414        // Create the mask using vXi16 shifts. For shift-rights we need to move
30415        // the upper byte down before splatting the vXi8 mask.
30416        SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30417        BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30418                                      BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30419        if (Opcode != ISD::SHL)
30420          BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30421                                               8, DAG);
30422        BitMask = DAG.getBitcast(VT, BitMask);
30423        BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30424                                       SmallVector<int, 64>(NumElts, 0));
30425
        // Shift the whole vector as vXi16, then mask off the bits that leaked
        // across byte boundaries.
30426        SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30427                                          DAG.getBitcast(ExtVT, R), BaseShAmt,
30428                                          BaseShAmtIdx, Subtarget, DAG);
30429        Res = DAG.getBitcast(VT, Res);
30430        Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30431
30432        if (Opcode == ISD::SRA) {
30433          // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30434          // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30435          SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30436          SignMask =
30437              getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30438                                  BaseShAmtIdx, Subtarget, DAG);
30439          SignMask = DAG.getBitcast(VT, SignMask);
30440          Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30441          Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30442        }
30443        return Res;
30444      }
30445    }
30446  }
30447
30448  return SDValue();
30449}
30450
30451// Convert a shift/rotate left amount to a multiplication scale factor.
// i.e. build a vector of per-element (1 << Amt) values so that
// shl(R, Amt) can be emitted as mul(R, Scale). Returns SDValue() when the
// type/feature combination has no profitable expansion.
// NOTE(review): the signature line (presumably
// 'static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,') is
// elided from this extract — confirm against the full source.
30453                                       const X86Subtarget &Subtarget,
30454                                       SelectionDAG &DAG) {
30455  MVT VT = Amt.getSimpleValueType();
  // Only types where a vector multiply is available/profitable.
30456  if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30457        (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30458        (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30459        (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30460        (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30461        (Subtarget.hasBWI() && VT == MVT::v64i8)))
30462    return SDValue();
30463
30464  MVT SVT = VT.getVectorElementType();
30465  unsigned SVTBits = SVT.getSizeInBits();
30466  unsigned NumElems = VT.getVectorNumElements();
30467
  // Constant amounts: build the (1 << Amt) vector directly, leaving
  // out-of-range or undef lanes undef.
30468  APInt UndefElts;
30469  SmallVector<APInt> EltBits;
30470  if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30471    APInt One(SVTBits, 1);
30472    SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30473    for (unsigned I = 0; I != NumElems; ++I) {
30474      if (UndefElts[I] || EltBits[I].uge(SVTBits))
30475        continue;
30476      uint64_t ShAmt = EltBits[I].getZExtValue();
30477      Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30478    }
30479    return DAG.getBuildVector(VT, dl, Elts);
30480  }
30481
30482  // If the target doesn't support variable shifts, use either FP conversion
30483  // or integer multiplication to avoid shifting each element individually.
  // v4i32: move Amt into the f32 exponent field (shl by 23 mantissa bits and
  // add the 1.0f bias 0x3f800000) so each lane becomes 2^Amt, then convert
  // back to integer.
30484  if (VT == MVT::v4i32) {
30485    Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30486    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30487                      DAG.getConstant(0x3f800000U, dl, VT));
30488    Amt = DAG.getBitcast(MVT::v4f32, Amt);
30489    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30490  }
30491
30492  // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
  // Pre-AVX2 v8i16: zero-unpack to two v4i32 halves, recurse on each half,
  // then pack the results back to v8i16.
30493  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30494    SDValue Z = DAG.getConstant(0, dl, VT);
30495    SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30496    SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30497    Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30498    Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30499    if (Subtarget.hasSSE41())
30500      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30501    return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30502  }
30503
30504  return SDValue();
30505}
30506
30507static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30508 SelectionDAG &DAG) {
30509 MVT VT = Op.getSimpleValueType();
30510 SDLoc dl(Op);
30511 SDValue R = Op.getOperand(0);
30512 SDValue Amt = Op.getOperand(1);
30513 unsigned NumElts = VT.getVectorNumElements();
30514 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30515 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30516
30517 unsigned Opc = Op.getOpcode();
30518 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30519 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30520
30521 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30522 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30523
30524 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30525 return V;
30526
30527 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30528 return V;
30529
30530 if (supportedVectorVarShift(VT, Subtarget, Opc))
30531 return Op;
30532
30533 // i64 vector arithmetic shift can be emulated with the transform:
30534 // M = lshr(SIGN_MASK, Amt)
30535 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30536 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30537 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30538 Opc == ISD::SRA) {
30539 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30540 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30541 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30542 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30543 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30544 return R;
30545 }
30546
30547 // XOP has 128-bit variable logical/arithmetic shifts.
30548 // +ve/-ve Amt = shift left/right.
30549 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30550 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30551 if (Opc == ISD::SRL || Opc == ISD::SRA)
30552 Amt = DAG.getNegative(Amt, dl, VT);
30553 if (Opc == ISD::SHL || Opc == ISD::SRL)
30554 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30555 if (Opc == ISD::SRA)
30556 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30557 }
30558
30559 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30560 // shifts per-lane and then shuffle the partial results back together.
30561 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30562 // Splat the shift amounts so the scalar shifts above will catch it.
30563 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30564 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30565 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30566 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30567 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30568 }
30569
30570 // Build a map of inrange constant amounts with element mask where they occur.
30572 if (ConstantAmt) {
30573 for (unsigned I = 0; I != NumElts; ++I) {
30574 SDValue A = Amt.getOperand(I);
30575 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30576 continue;
30577 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30578 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30579 if (!Inserted) {
30580 It->second.setBit(I);
30581 continue;
30582 }
30583 It->second = APInt::getOneBitSet(NumElts, I);
30584 }
30585 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30586 }
30587
30588 // If possible, lower this shift as a sequence of two shifts by
30589 // constant plus a BLENDing shuffle instead of scalarizing it.
30590 // Example:
30591 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30592 //
30593 // Could be rewritten as:
30594 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30595 //
30596 // The advantage is that the two shifts from the example would be
30597 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30598 if (UniqueCstAmt.size() == 2 &&
30599 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30600 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30601 unsigned AmtA = UniqueCstAmt.begin()->first;
30602 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30603 const APInt &MaskA = UniqueCstAmt.begin()->second;
30604 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30605 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30606 for (unsigned I = 0; I != NumElts; ++I) {
30607 if (MaskA[I])
30608 ShuffleMask[I] = I;
30609 if (MaskB[I])
30610 ShuffleMask[I] = I + NumElts;
30611 }
30612
30613 // Only perform this blend if we can perform it without loading a mask.
30614 if ((VT != MVT::v16i16 ||
30615 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30616 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30617 canWidenShuffleElements(ShuffleMask))) {
30618 SDValue Shift1 =
30619 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30620 SDValue Shift2 =
30621 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30622 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30623 }
30624 }
30625
30626 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30627 // using vYiM vector operations where X*N == Y*M and M > N.
30628 if (ConstantAmt &&
30629 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30630 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30631 !Subtarget.hasXOP()) {
30632 MVT NarrowScalarVT = VT.getScalarType();
30633 // We can do this extra fast if each pair of narrow elements is shifted by
30634 // the same amount by doing this SWAR style: use a shift to move the valid
30635 // bits to the right position, mask out any bits which crossed from one
30636 // element to the other.
30637 // This optimized lowering is only valid if the elements in a pair can
30638 // be treated identically.
30639 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30640 SmallVector<SDValue, 32> TmpAmtWideElts;
30641 int WideEltSizeInBits = EltSizeInBits;
30642 while (WideEltSizeInBits < 32) {
30643 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30644 // unprofitable.
30645 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30646 break;
30647 }
30648 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30649 bool SameShifts = true;
30650 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30651 unsigned DstI = SrcI / 2;
30652 // Both elements are undef? Make a note and keep going.
30653 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30654 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30655 continue;
30656 }
30657 // Even element is undef? We will shift it by the same shift amount as
30658 // the odd element.
30659 if (AmtWideElts[SrcI].isUndef()) {
30660 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30661 continue;
30662 }
30663 // Odd element is undef? We will shift it by the same shift amount as
30664 // the even element.
30665 if (AmtWideElts[SrcI + 1].isUndef()) {
30666 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30667 continue;
30668 }
30669 // Both elements are equal.
30670 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30671 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30672 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30673 continue;
30674 }
30675 // One of the provisional wide elements will not have the same shift
30676 // amount. Let's bail.
30677 SameShifts = false;
30678 break;
30679 }
30680 if (!SameShifts) {
30681 break;
30682 }
30683 WideEltSizeInBits *= 2;
30684 std::swap(TmpAmtWideElts, AmtWideElts);
30685 }
30686 APInt APIntShiftAmt;
30687 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30688 bool Profitable = WidenShift;
30689 // AVX512BW brings support for vpsllvw.
30690 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30691 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30692 Profitable = false;
30693 }
30694 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30695 // fairly cheaply in other ways.
30696 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30697 Profitable = false;
30698 }
30699 // Leave it up to GFNI if we have it around.
30700 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30701 // is probably a win to use other strategies in some cases.
30702 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30703 Profitable = false;
30704 }
30705
30706 // AVX1 does not have vpand which makes our masking impractical. It does
30707 // have vandps but that is an FP instruction and crossing FP<->int typically
30708 // has some cost.
30709 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30710 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30711 Profitable = false;
30712 }
30713 unsigned WideNumElts = AmtWideElts.size();
30714 // We are only dealing with identical pairs.
30715 if (Profitable && WideNumElts != NumElts) {
30716 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30717 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30718 // Cast the operand to vXiM.
30719 SDValue RWide = DAG.getBitcast(WideVT, R);
30720 // Create our new vector of shift amounts.
30721 SDValue AmtWide = DAG.getBuildVector(
30722 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30723 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30724 // Perform the actual shift.
30725 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30726 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30727 // Now we need to construct a mask which will "drop" bits that get
30728 // shifted past the LSB/MSB. For a logical shift left, it will look
30729 // like:
30730 // FullMask = (1 << EltSizeInBits) - 1
30731 // Mask = FullMask << Amt
30732 //
30733 // This masking ensures that bits cannot migrate from one narrow lane to
30734 // another. The construction of this mask will be constant folded.
30735 // The mask for a logical right shift is nearly identical, the only
30736 // difference is that the all ones mask is shifted right instead of left.
30737 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30738 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30739 Mask = DAG.getBitcast(WideVT, Mask);
30740 // Finally, we mask the shifted vector with the SWAR mask.
30741 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30742 Masked = DAG.getBitcast(VT, Masked);
30743 if (Opc != ISD::SRA) {
30744 // Logical shifts are complete at this point.
30745 return Masked;
30746 }
30747 // At this point, we have done a *logical* shift right. We now need to
30748 // sign extend the result so that we get behavior equivalent to an
30749 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30750 // are `EltSizeInBits-AmtWide` bits wide.
30751 //
30752 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30753 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30754 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30755 // can use the following trick to accomplish this:
30756 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30757 // (Masked ^ SignBitMask) - SignBitMask
30758 //
30759 // When the sign bit is already clear, this will compute:
30760 // Masked + SignBitMask - SignBitMask
30761 //
30762 // This is equal to Masked which is what we want: the sign bit was clear
30763 // so sign extending should be a no-op.
30764 //
30765 // When the sign bit is set, this will compute:
30766 // Masked - SignBitmask - SignBitMask
30767 //
30768 // This is equal to Masked - 2*SignBitMask which will correctly sign
30769 // extend our result.
30770 SDValue SplatHighBit =
30771 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30772 // This does not induce recursion, all operands are constants.
30773 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30774 SDValue FlippedSignBit =
30775 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30776 SDValue Subtraction =
30777 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30778 return Subtraction;
30779 }
30780 }
30781
30782 // If possible, lower this packed shift into a vector multiply instead of
30783 // expanding it into a sequence of scalar shifts.
30784 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30785 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30786 Subtarget.canExtendTo512BW())))
30787 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30788 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30789
30790 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30791 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30792 if (Opc == ISD::SRL && ConstantAmt &&
30793 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30794 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30795 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30796 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30797 SDValue Zero = DAG.getConstant(0, dl, VT);
30798 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30799 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30800 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30801 }
30802 }
30803
30804 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30805 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30806 // TODO: Special case handling for shift by 0/1, really we can afford either
30807 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30808 if (Opc == ISD::SRA && ConstantAmt &&
30809 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30810 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30811 !Subtarget.hasAVX512()) ||
30812 DAG.isKnownNeverZero(Amt))) {
30813 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30814 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30815 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30816 SDValue Amt0 =
30817 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30818 SDValue Amt1 =
30819 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30820 SDValue Sra1 =
30821 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30822 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30823 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30824 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30825 }
30826 }
30827
30828 // v4i32 Non Uniform Shifts.
30829 // If the shift amount is constant we can shift each lane using the SSE2
30830 // immediate shifts, else we need to zero-extend each lane to the lower i64
30831 // and shift using the SSE2 variable shifts.
30832 // The separate results can then be blended together.
30833 if (VT == MVT::v4i32) {
30834 SDValue Amt0, Amt1, Amt2, Amt3;
30835 if (ConstantAmt) {
30836 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30837 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30838 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30839 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30840 } else {
30841 // The SSE2 shifts use the lower i64 as the same shift amount for
30842 // all lanes and the upper i64 is ignored. On AVX we're better off
30843 // just zero-extending, but for SSE just duplicating the top 16-bits is
30844 // cheaper and has the same effect for out of range values.
30845 if (Subtarget.hasAVX()) {
30846 SDValue Z = DAG.getConstant(0, dl, VT);
30847 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30848 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30849 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30850 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30851 } else {
30852 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30853 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30854 {4, 5, 6, 7, -1, -1, -1, -1});
30855 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30856 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30857 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30858 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30859 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30860 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30861 }
30862 }
30863
30864 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30865 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30866 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30867 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30868 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30869
30870 // Merge the shifted lane results optimally with/without PBLENDW.
30871 // TODO - ideally shuffle combining would handle this.
30872 if (Subtarget.hasSSE41()) {
30873 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30874 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30875 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30876 }
30877 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30878 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30879 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30880 }
30881
30882 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30883 // look up the pre-computed shift values.
30884 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30885 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30886 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30887 unsigned NumLanes = VT.getSizeInBits() / 128u;
30888 unsigned NumEltsPerLane = NumElts / NumLanes;
30890 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30891 unsigned LoElt = Lane * NumEltsPerLane;
30892 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30893 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30894 if (!KnownLane.isConstant())
30895 break;
30896 const APInt &LaneSplat = KnownLane.getConstant();
30897 for (unsigned I = 0; I != 8; ++I) {
30898 if (Opc == ISD::SHL)
30899 LUT.push_back(LaneSplat.shl(I));
30900 else if (Opc == ISD::SRL)
30901 LUT.push_back(LaneSplat.lshr(I));
30902 else if (Opc == ISD::SRA)
30903 LUT.push_back(LaneSplat.ashr(I));
30904 }
30905 LUT.append(8, APInt::getZero(8));
30906 }
30907 if (LUT.size() == NumElts) {
30908 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30909 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30910 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30911 }
30912 }
30913
30914 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30915 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30916 // make the existing SSE solution better.
30917 // NOTE: We honor prefered vector width before promoting to 512-bits.
30918 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30919 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30920 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30921 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30922 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30923 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30924 "Unexpected vector type");
30925 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30926 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30927 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30928 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30929 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30930 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30931 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30932 }
30933
30934 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30935 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30936 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30937 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30938 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30939 !Subtarget.hasXOP()) {
30940 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30941 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30942
30943 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30944 // isn't legal).
30945 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30946 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30947 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30948 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30950 "Constant build vector expected");
30951
30952 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30953 bool IsSigned = Opc == ISD::SRA;
30954 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30955 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30956 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30957 return DAG.getZExtOrTrunc(R, dl, VT);
30958 }
30959
30960 SmallVector<SDValue, 16> LoAmt, HiAmt;
30961 for (unsigned i = 0; i != NumElts; i += 16) {
30962 for (int j = 0; j != 8; ++j) {
30963 LoAmt.push_back(Amt.getOperand(i + j));
30964 HiAmt.push_back(Amt.getOperand(i + j + 8));
30965 }
30966 }
30967
30968 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30969 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30970
30971 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30972 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30973 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30974 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30975 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30976 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30977 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30978 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30979 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30980 }
30981
30982 if (VT == MVT::v16i8 ||
30983 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30984 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30985 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30986
30987 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30988 if (VT.is512BitVector()) {
30989 // On AVX512BW targets we make use of the fact that VSELECT lowers
30990 // to a masked blend which selects bytes based just on the sign bit
30991 // extracted to a mask.
30992 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30993 V0 = DAG.getBitcast(VT, V0);
30994 V1 = DAG.getBitcast(VT, V1);
30995 Sel = DAG.getBitcast(VT, Sel);
30996 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30997 ISD::SETGT);
30998 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30999 } else if (Subtarget.hasSSE41()) {
31000 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31001 // on the sign bit.
31002 V0 = DAG.getBitcast(VT, V0);
31003 V1 = DAG.getBitcast(VT, V1);
31004 Sel = DAG.getBitcast(VT, Sel);
31005 return DAG.getBitcast(SelVT,
31006 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31007 }
31008 // On pre-SSE41 targets we test for the sign bit by comparing to
31009 // zero - a negative value will set all bits of the lanes to true
31010 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31011 SDValue Z = DAG.getConstant(0, dl, SelVT);
31012 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31013 return DAG.getSelect(dl, SelVT, C, V0, V1);
31014 };
31015
31016 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31017 // We can safely do this using i16 shifts as we're only interested in
31018 // the 3 lower bits of each byte.
31019 Amt = DAG.getBitcast(ExtVT, Amt);
31020 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31021 Amt = DAG.getBitcast(VT, Amt);
31022
31023 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31024 // r = VSELECT(r, shift(r, 4), a);
31025 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31026 R = SignBitSelect(VT, Amt, M, R);
31027
31028 // a += a
31029 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31030
31031 // r = VSELECT(r, shift(r, 2), a);
31032 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31033 R = SignBitSelect(VT, Amt, M, R);
31034
31035 // a += a
31036 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31037
31038 // return VSELECT(r, shift(r, 1), a);
31039 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31040 R = SignBitSelect(VT, Amt, M, R);
31041 return R;
31042 }
31043
31044 if (Opc == ISD::SRA) {
31045 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31046 // so we can correctly sign extend. We don't care what happens to the
31047 // lower byte.
31048 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31049 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31050 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31051 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31052 ALo = DAG.getBitcast(ExtVT, ALo);
31053 AHi = DAG.getBitcast(ExtVT, AHi);
31054 RLo = DAG.getBitcast(ExtVT, RLo);
31055 RHi = DAG.getBitcast(ExtVT, RHi);
31056
31057 // r = VSELECT(r, shift(r, 4), a);
31058 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31059 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31060 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31061 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31062
31063 // a += a
31064 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31065 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31066
31067 // r = VSELECT(r, shift(r, 2), a);
31068 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31069 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31070 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31071 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31072
31073 // a += a
31074 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31075 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31076
31077 // r = VSELECT(r, shift(r, 1), a);
31078 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31079 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31080 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31081 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31082
31083 // Logical shift the result back to the lower byte, leaving a zero upper
31084 // byte meaning that we can safely pack with PACKUSWB.
31085 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31086 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31087 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31088 }
31089 }
31090
31091 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31092 MVT ExtVT = MVT::v8i32;
31093 SDValue Z = DAG.getConstant(0, dl, VT);
31094 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31095 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31096 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31097 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31098 ALo = DAG.getBitcast(ExtVT, ALo);
31099 AHi = DAG.getBitcast(ExtVT, AHi);
31100 RLo = DAG.getBitcast(ExtVT, RLo);
31101 RHi = DAG.getBitcast(ExtVT, RHi);
31102 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31103 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31104 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31105 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31106 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31107 }
31108
31109 if (VT == MVT::v8i16) {
31110 // If we have a constant shift amount, the non-SSE41 path is best as
31111 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
31112 bool UseSSE41 = Subtarget.hasSSE41() &&
31114
31115 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31116 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31117 // the sign bit.
31118 if (UseSSE41) {
31119 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31120 V0 = DAG.getBitcast(ExtVT, V0);
31121 V1 = DAG.getBitcast(ExtVT, V1);
31122 Sel = DAG.getBitcast(ExtVT, Sel);
31123 return DAG.getBitcast(
31124 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31125 }
31126 // On pre-SSE41 targets we splat the sign bit - a negative value will
31127 // set all bits of the lanes to true and VSELECT uses that in
31128 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31129 SDValue C =
31130 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31131 return DAG.getSelect(dl, VT, C, V0, V1);
31132 };
31133
31134 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31135 if (UseSSE41) {
31136 // On SSE41 targets we need to replicate the shift mask in both
31137 // bytes for PBLENDVB.
31138 Amt = DAG.getNode(
31139 ISD::OR, dl, VT,
31140 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31141 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31142 } else {
31143 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31144 }
31145
31146 // r = VSELECT(r, shift(r, 8), a);
31147 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31148 R = SignBitSelect(Amt, M, R);
31149
31150 // a += a
31151 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31152
31153 // r = VSELECT(r, shift(r, 4), a);
31154 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31155 R = SignBitSelect(Amt, M, R);
31156
31157 // a += a
31158 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31159
31160 // r = VSELECT(r, shift(r, 2), a);
31161 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31162 R = SignBitSelect(Amt, M, R);
31163
31164 // a += a
31165 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31166
31167 // return VSELECT(r, shift(r, 1), a);
31168 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31169 R = SignBitSelect(Amt, M, R);
31170 return R;
31171 }
31172
31173 // Decompose 256-bit shifts into 128-bit shifts.
31174 if (VT.is256BitVector())
31175 return splitVectorIntBinary(Op, DAG, dl);
31176
31177 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31178 return splitVectorIntBinary(Op, DAG, dl);
31179
31180 return SDValue();
31181}
31182
31184 SelectionDAG &DAG) {
  // Lower ISD::FSHL/FSHR (funnel shifts). Vector types are handled first
  // (native VBMI2 instructions, constant-splat expansion, splitting, or
  // unpack-based widening tricks); scalar i16/i32/i64 fall through to the
  // x86 SHLD/SHRD-style lowering at the bottom.
31185 MVT VT = Op.getSimpleValueType();
31186 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31187 "Unexpected funnel shift opcode!");
31188
31189 SDLoc DL(Op);
31190 SDValue Op0 = Op.getOperand(0);
31191 SDValue Op1 = Op.getOperand(1);
31192 SDValue Amt = Op.getOperand(2);
31193 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31194 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31195
31196 if (VT.isVector()) {
31197 APInt APIntShiftAmt;
31198 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31199 unsigned NumElts = VT.getVectorNumElements();
31200
  // AVX512 VBMI2 has native funnel-shift instructions (VSHLD/VSHRD for a
  // constant splat amount, VSHLDV/VSHRDV for variable per-element amounts)
  // for 16-bit and wider elements.
31201 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31202 if (IsFSHR)
31203 std::swap(Op0, Op1);
31204
31205 if (IsCstSplat) {
31206 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31207 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31208 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31209 {Op0, Op1, Imm}, DAG, Subtarget);
31210 }
31211 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31212 {Op0, Op1, Amt}, DAG, Subtarget);
31213 }
31214 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31215 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31216 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31217 "Unexpected funnel shift type!");
31218
31219 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
31220 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
31221 if (IsCstSplat) {
31222 // TODO: Can't use generic expansion as UNDEF amt elements can be
31223 // converted to other values when folded to shift amounts, losing the
31224 // splat.
  // Split the splat amount into the SHL amount for Op0 and the SRL amount
  // for Op1; the two partial results are OR'd together below.
31225 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31226 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31227 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31228 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31229 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31230
31231 if (EltSizeInBits == 8 &&
31232 (Subtarget.hasXOP() ||
31233 (useVPTERNLOG(Subtarget, VT) &&
31234 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31235 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31236 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31237 // the original vector width to handle cases where we split.
31238 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31239 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31240 SDValue ShX =
31241 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31242 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31243 SDValue ShY =
31244 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31245 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
  // Mask away the bits that leaked across i8 element boundaries inside the
  // i16 shifts, then recombine.
31246 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31247 DAG.getConstant(MaskX, DL, VT));
31248 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31249 DAG.getConstant(MaskY, DL, VT));
31250 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31251 }
31252
  // Generic constant-splat expansion: (Op0 << ShXAmt) | (Op1 >> ShYAmt).
31253 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31254 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31255 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31256 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31257 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31258 }
31259
  // Non-splat amounts: reduce the amount modulo the element width up front.
31260 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31261 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31262 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31263
31264 // Constant vXi16 funnel shifts can be efficiently handled by default.
31265 if (IsCst && EltSizeInBits == 16)
31266 return SDValue();
31267
31268 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31269 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31270 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31271
31272 // Split 256-bit integers on XOP/pre-AVX2 targets.
31273 // Split 512-bit integers on non 512-bit BWI targets.
31274 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31275 !Subtarget.hasAVX2())) ||
31276 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31277 EltSizeInBits < 32)) {
31278 // Pre-mask the amount modulo using the wider vector.
31279 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31280 return splitVectorOp(Op, DAG, DL);
31281 }
31282
31283 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31284 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31285 int ScalarAmtIdx = -1;
31286 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31287 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31288 if (EltSizeInBits == 16)
31289 return SDValue();
31290
  // Interleave (Op1, Op0) into double-width lanes, shift both halves by
  // the uniform amount, then pack back down to the original type.
31291 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31292 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31293 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31294 ScalarAmtIdx, Subtarget, DAG);
31295 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31296 ScalarAmtIdx, Subtarget, DAG);
31297 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31298 }
31299 }
31300
31301 MVT WideSVT = MVT::getIntegerVT(
31302 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31303 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31304
31305 // If per-element shifts are legal, fallback to generic expansion.
31306 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31307 return SDValue();
31308
31309 // Attempt to fold as:
31310 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31311 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31312 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31313 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31314 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31315 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31316 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31317 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31318 EltSizeInBits, DAG);
31319 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31320 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31321 if (!IsFSHR)
31322 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31323 EltSizeInBits, DAG);
31324 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31325 }
31326
31327 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31328 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31329 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31330 SDValue Z = DAG.getConstant(0, DL, VT);
31331 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31332 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
  // Zero-extend the (already masked) amounts into the matching
  // double-width lanes so each lane shifts by its own amount.
31333 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31334 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31335 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31336 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31337 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31338 }
31339
31340 // Fallback to generic expansion.
31341 return SDValue();
31342 }
31343 assert(
31344 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31345 "Unexpected funnel shift type!");
31346
31347 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31348 bool OptForSize = DAG.shouldOptForSize();
31349 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31350
31351 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31352 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31353 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31354 !isa<ConstantSDNode>(Amt)) {
  // Build the concatenation in i32, shift it, and truncate back.
31355 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31356 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31357 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31358 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31359 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31360 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31361 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31362 if (IsFSHR) {
31363 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31364 } else {
31365 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31366 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31367 }
31368 return DAG.getZExtOrTrunc(Res, DL, VT);
31369 }
31370
  // Let generic legalization expand the remaining slow/i8 cases.
31371 if (VT == MVT::i8 || ExpandFunnel)
31372 return SDValue();
31373
31374 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31375 if (VT == MVT::i16) {
31376 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31377 DAG.getConstant(15, DL, Amt.getValueType()));
31378 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31379 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31380 }
31381
  // i32/i64: the hardware SHLD/SHRD lowering handles Op directly.
31382 return Op;
31383}
31384
31385static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31386 SelectionDAG &DAG) {
31387 MVT VT = Op.getSimpleValueType();
31388 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31389
31390 SDLoc DL(Op);
31391 SDValue R = Op.getOperand(0);
31392 SDValue Amt = Op.getOperand(1);
31393 unsigned Opcode = Op.getOpcode();
31394 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31395 int NumElts = VT.getVectorNumElements();
31396 bool IsROTL = Opcode == ISD::ROTL;
31397
31398 // Check for constant splat rotation amount.
31399 APInt CstSplatValue;
31400 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31401
31402 // Check for splat rotate by zero.
31403 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31404 return R;
31405
31406 // AVX512 implicitly uses modulo rotation amounts.
31407 if ((Subtarget.hasVLX() ||
31408 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
31409 32 <= EltSizeInBits) {
31410 // Attempt to rotate by immediate.
31411 if (IsCstSplat) {
31412 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31413 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31414 return DAG.getNode(RotOpc, DL, VT, R,
31415 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31416 }
31417
31418 // Else, fall-back on VPROLV/VPRORV.
31419 return Op;
31420 }
31421
31422 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31423 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31424 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31425 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31426 }
31427
31428 SDValue Z = DAG.getConstant(0, DL, VT);
31429
31430 if (!IsROTL) {
31431 // If the ISD::ROTR amount is constant, we're always better converting to
31432 // ISD::ROTL.
31433 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31434 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31435
31436 // XOP targets always prefers ISD::ROTL.
31437 if (Subtarget.hasXOP())
31438 return DAG.getNode(ISD::ROTL, DL, VT, R,
31439 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31440 }
31441
31442 // Attempt to use GFNI gf2p8affine to rotate vXi8 by an uniform constant.
31443 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31445 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31446 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31447 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31448 DAG.getTargetConstant(0, DL, MVT::i8));
31449 }
31450
31451 // Split 256-bit integers on XOP/pre-AVX2 targets.
31452 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31453 return splitVectorIntBinary(Op, DAG, DL);
31454
31455 // XOP has 128-bit vector variable + immediate rotates.
31456 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31457 // XOP implicitly uses modulo rotation amounts.
31458 if (Subtarget.hasXOP()) {
31459 assert(IsROTL && "Only ROTL expected");
31460 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31461
31462 // Attempt to rotate by immediate.
31463 if (IsCstSplat) {
31464 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31465 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31466 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31467 }
31468
31469 // Use general rotate by variable (per-element).
31470 return Op;
31471 }
31472
31473 // Rotate by an uniform constant - expand back to shifts.
31474 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31475 // to other values when folded to shift amounts, losing the splat.
31476 if (IsCstSplat) {
31477 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31478 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31479 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31480 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31481 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31482 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31483 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31484 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31485 }
31486
31487 // Split 512-bit integers on non 512-bit BWI targets.
31488 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31489 return splitVectorIntBinary(Op, DAG, DL);
31490
31491 assert(
31492 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31493 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31494 Subtarget.hasAVX2()) ||
31495 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31496 "Only vXi32/vXi16/vXi8 vector rotates supported");
31497
31498 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31499 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31500
31501 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31502 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31503
31504 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31505 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31506 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31507 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31508 int BaseRotAmtIdx = -1;
31509 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31510 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31511 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31512 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31513 }
31514 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31515 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31516 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31517 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31518 BaseRotAmtIdx, Subtarget, DAG);
31519 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31520 BaseRotAmtIdx, Subtarget, DAG);
31521 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31522 }
31523 }
31524
31525 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31526 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31527
31528 // Attempt to fold as unpack(x,x) << zext(y):
31529 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31530 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31531 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31532 if (!(ConstantAmt && EltSizeInBits != 8) &&
31533 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31534 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31535 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31536 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31537 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31538 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31539 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31540 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31541 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31542 }
31543
31544 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31545 // the amount bit.
31546 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31547 if (EltSizeInBits == 8) {
31548 MVT WideVT =
31549 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31550
31551 // Attempt to fold as:
31552 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31553 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31554 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31555 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31556 // If we're rotating by constant, just use default promotion.
31557 if (ConstantAmt)
31558 return SDValue();
31559 // See if we can perform this by widening to vXi16 or vXi32.
31560 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31561 R = DAG.getNode(
31562 ISD::OR, DL, WideVT, R,
31563 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31564 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31565 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31566 if (IsROTL)
31567 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31568 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31569 }
31570
31571 // We don't need ModuloAmt here as we just peek at individual bits.
31572 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31573 if (Subtarget.hasSSE41()) {
31574 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31575 // on the sign bit.
31576 V0 = DAG.getBitcast(VT, V0);
31577 V1 = DAG.getBitcast(VT, V1);
31578 Sel = DAG.getBitcast(VT, Sel);
31579 return DAG.getBitcast(SelVT,
31580 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31581 }
31582 // On pre-SSE41 targets we test for the sign bit by comparing to
31583 // zero - a negative value will set all bits of the lanes to true
31584 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31585 SDValue Z = DAG.getConstant(0, DL, SelVT);
31586 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31587 return DAG.getSelect(DL, SelVT, C, V0, V1);
31588 };
31589
31590 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31591 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31592 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31593 IsROTL = true;
31594 }
31595
31596 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31597 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31598
31599 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31600 // We can safely do this using i16 shifts as we're only interested in
31601 // the 3 lower bits of each byte.
31602 Amt = DAG.getBitcast(ExtVT, Amt);
31603 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31604 Amt = DAG.getBitcast(VT, Amt);
31605
31606 // r = VSELECT(r, rot(r, 4), a);
31607 SDValue M;
31608 M = DAG.getNode(
31609 ISD::OR, DL, VT,
31610 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31611 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31612 R = SignBitSelect(VT, Amt, M, R);
31613
31614 // a += a
31615 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31616
31617 // r = VSELECT(r, rot(r, 2), a);
31618 M = DAG.getNode(
31619 ISD::OR, DL, VT,
31620 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31621 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31622 R = SignBitSelect(VT, Amt, M, R);
31623
31624 // a += a
31625 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31626
31627 // return VSELECT(r, rot(r, 1), a);
31628 M = DAG.getNode(
31629 ISD::OR, DL, VT,
31630 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31631 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31632 return SignBitSelect(VT, Amt, M, R);
31633 }
31634
31635 bool IsSplatAmt = DAG.isSplatValue(Amt);
31636 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31637 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31638
31639 // Fallback for splats + all supported variable shifts.
31640 // Fallback for non-constants AVX2 vXi16 as well.
31641 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31642 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31643 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31644 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31645 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31646 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31647 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31648 }
31649
31650 // Everything below assumes ISD::ROTL.
31651 if (!IsROTL) {
31652 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31653 IsROTL = true;
31654 }
31655
31656 // ISD::ROT* uses modulo rotate amounts.
31657 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31658
31659 assert(IsROTL && "Only ROTL supported");
31660
31661 // As with shifts, attempt to convert the rotation amount to a multiplication
31662 // factor, fallback to general expansion.
31663 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31664 if (!Scale)
31665 return SDValue();
31666
31667 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31668 if (EltSizeInBits == 16) {
31669 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31670 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31671 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31672 }
31673
31674 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31675 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31676 // that can then be OR'd with the lower 32-bits.
31677 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31678 static const int OddMask[] = {1, 1, 3, 3};
31679 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31680 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31681
31682 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31683 DAG.getBitcast(MVT::v2i64, R),
31684 DAG.getBitcast(MVT::v2i64, Scale));
31685 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31686 DAG.getBitcast(MVT::v2i64, R13),
31687 DAG.getBitcast(MVT::v2i64, Scale13));
31688 Res02 = DAG.getBitcast(VT, Res02);
31689 Res13 = DAG.getBitcast(VT, Res13);
31690
31691 return DAG.getNode(ISD::OR, DL, VT,
31692 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31693 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31694}
31695
31696/// Returns true if the operand type is exactly twice the native width, and
31697/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31698/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31699/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31700bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31701 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31702
31703 if (OpWidth == 64)
31704 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31705 if (OpWidth == 128)
31706 return Subtarget.canUseCMPXCHG16B();
31707
31708 return false;
31709}
31710
31712X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31713 Type *MemType = SI->getValueOperand()->getType();
31714
31715 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31716 !Subtarget.useSoftFloat()) {
31717 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31718 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31720
31721 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31722 Subtarget.hasAVX())
31724 }
31725
31726 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31728}
31729
31730// Note: this turns large loads into lock cmpxchg8b/16b.
31732X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31733 Type *MemType = LI->getType();
31734
31735 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31736 !Subtarget.useSoftFloat()) {
31737 // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
31738 // can use movq to do the load. If we have X87 we can load into an 80-bit
31739 // X87 register and store it to a stack temporary.
31740 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31741 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31743
31744 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31745 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31746 Subtarget.hasAVX())
31748 }
31749
31750 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31752}
31753
/// Classifies how an atomicrmw value operand selects a single bit; used to
/// decide whether BTS/BTR/BTC can implement the operation. Restored the
/// enumerator lines stripped by extraction.
enum BitTestKind : unsigned {
  ConstantBit,    // Constant power-of-2 mask.
  NotConstantBit, // Inverted constant power-of-2 mask.
  ShiftBit,       // (1 << X) mask.
  NotShiftBit,    // ~(1 << X) mask.
  UndefBit        // No recognizable single-bit pattern.
};

/// Analyze \p V and classify whether it denotes a single-bit mask (see
/// BitTestKind). Returns the bit-position value (for shift patterns) or the
/// original constant, paired with the classification; returns
/// {nullptr, UndefBit} when no single-bit pattern is recognized.
static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
  using namespace llvm::PatternMatch;
  BitTestKind BTK = UndefBit;
  if (auto *C = dyn_cast<ConstantInt>(V)) {
    // Check if V is a power of 2 or NOT power of 2.
    if (isPowerOf2_64(C->getZExtValue()))
      BTK = ConstantBit;
    else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
      BTK = NotConstantBit;
    return {V, BTK};
  }

  // Check if V is some power of 2 pattern known to be non-zero
  if (auto *I = dyn_cast<Instruction>(V)) {
    bool Not = false;
    // Check if we have a NOT
    Value *PeekI;
    if (match(I, m_Not(m_Value(PeekI))) ||
        match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
      Not = true;
      I = dyn_cast<Instruction>(PeekI);

      // If I is constant, it will fold and we can evaluate later. If its an
      // argument or something of that nature, we can't analyze.
      if (I == nullptr)
        return {nullptr, UndefBit};
    }
    // We can only use 1 << X without more sophisticated analysis. C << X where
    // C is a power of 2 but not 1 can result in zero which cannot be translated
    // to bittest. Likewise any C >> X (either arith or logical) can be zero.
    if (I->getOpcode() == Instruction::Shl) {
      // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
      // -X` and some other provable power of 2 patterns that we can use CTZ on
      // may be profitable.
      // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
      // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
      // be provably a non-zero power of 2.
      // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
      // transformable to bittest.

      // Only 1 << X is guaranteed to keep exactly one bit set.
      auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
      if (!ShiftVal)
        return {nullptr, UndefBit};
      if (ShiftVal->equalsInt(1))
        BTK = Not ? NotShiftBit : ShiftBit;

      if (BTK == UndefBit)
        return {nullptr, UndefBit};

      Value *BitV = I->getOperand(1);

      // Read past a shiftmask instruction to find count
      Value *AndOp;
      uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
      if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
        BitV = AndOp;

      return {BitV, BTK};
    }
  }
  return {nullptr, UndefBit};
}
31823
31825X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31826 using namespace llvm::PatternMatch;
31827 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31828 // prefix to a normal instruction for these operations.
31829 if (AI->use_empty())
31831
31832 if (AI->getOperation() == AtomicRMWInst::Xor) {
31833 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31834 // preferable to both `cmpxchg` and `btc`.
31835 if (match(AI->getOperand(1), m_SignMask()))
31837 }
31838
31839 // If the atomicrmw's result is used by a single bit AND, we may use
31840 // bts/btr/btc instruction for these operations.
31841 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31842 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31843 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31844 // detect it.
31845 Instruction *I = AI->user_back();
31846 auto BitChange = FindSingleBitChange(AI->getValOperand());
31847 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31848 I->getOpcode() != Instruction::And ||
31849 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31850 AI->getParent() != I->getParent())
31852
31853 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31854
31855 // This is a redundant AND, it should get cleaned up elsewhere.
31856 if (AI == I->getOperand(OtherIdx))
31858
31859 // The following instruction must be a AND single bit.
31860 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31861 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31862 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31863 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31865 }
31866 if (AI->getOperation() == AtomicRMWInst::And) {
31867 return ~C1->getValue() == C2->getValue()
31870 }
31873 }
31874
31875 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31876
31877 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31878 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31880
31881 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31882
31883 // If shift amounts are not the same we can't use BitTestIntrinsic.
31884 if (BitChange.first != BitTested.first)
31886
31887 // If atomic AND need to be masking all be one bit and testing the one bit
31888 // unset in the mask.
31889 if (AI->getOperation() == AtomicRMWInst::And)
31890 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31893
31894 // If atomic XOR/OR need to be setting and testing the same bit.
31895 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31898}
31899
31900void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31901 IRBuilder<> Builder(AI);
31902 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31905 switch (AI->getOperation()) {
31906 default:
31907 llvm_unreachable("Unknown atomic operation");
31908 case AtomicRMWInst::Or:
31909 IID_C = Intrinsic::x86_atomic_bts;
31910 IID_I = Intrinsic::x86_atomic_bts_rm;
31911 break;
31912 case AtomicRMWInst::Xor:
31913 IID_C = Intrinsic::x86_atomic_btc;
31914 IID_I = Intrinsic::x86_atomic_btc_rm;
31915 break;
31916 case AtomicRMWInst::And:
31917 IID_C = Intrinsic::x86_atomic_btr;
31918 IID_I = Intrinsic::x86_atomic_btr_rm;
31919 break;
31920 }
31921 Instruction *I = AI->user_back();
31922 LLVMContext &Ctx = AI->getContext();
31923 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31925 Value *Result = nullptr;
31926 auto BitTested = FindSingleBitChange(AI->getValOperand());
31927 assert(BitTested.first != nullptr);
31928
31929 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31930 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31931
31932 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31933 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31934 {Addr, Builder.getInt8(Imm)});
31935 } else {
31936 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31937
31938 Value *SI = BitTested.first;
31939 assert(SI != nullptr);
31940
31941 // BT{S|R|C} on memory operand don't modulo bit position so we need to
31942 // mask it.
31943 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31944 Value *BitPos =
31945 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31946 // Todo(1): In many cases it may be provable that SI is less than
31947 // ShiftBits in which case this mask is unnecessary
31948 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31949 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31950 // favor of just a raw BT{S|R|C}.
31951
31952 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31953 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31954
31955 // If the result is only used for zero/non-zero status then we don't need to
31956 // shift value back. Otherwise do so.
31957 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31958 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31959 if (ICmp->isEquality()) {
31960 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31961 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31962 if (C0 || C1) {
31963 assert(C0 == nullptr || C1 == nullptr);
31964 if ((C0 ? C0 : C1)->isZero())
31965 continue;
31966 }
31967 }
31968 }
31969 Result = Builder.CreateShl(Result, BitPos);
31970 break;
31971 }
31972 }
31973
31974 I->replaceAllUsesWith(Result);
31975 I->eraseFromParent();
31976 AI->eraseFromParent();
31977}
31978
31980 using namespace llvm::PatternMatch;
31981 if (!AI->hasOneUse())
31982 return false;
31983
31984 Value *Op = AI->getOperand(1);
31985 CmpPredicate Pred;
31986 Instruction *I = AI->user_back();
31988 if (Opc == AtomicRMWInst::Add) {
31989 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31990 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31991 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31992 if (match(I->user_back(),
31994 return true;
31995 if (match(I->user_back(),
31997 return true;
31998 }
31999 return false;
32000 }
32001 if (Opc == AtomicRMWInst::Sub) {
32002 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32003 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32004 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32005 if (match(I->user_back(),
32007 return true;
32008 if (match(I->user_back(),
32010 return true;
32011 }
32012 return false;
32013 }
32014 if ((Opc == AtomicRMWInst::Or &&
32016 (Opc == AtomicRMWInst::And &&
32018 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32019 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32020 Pred == CmpInst::ICMP_SLT;
32021 if (match(I->user_back(),
32023 return true;
32024 return false;
32025 }
32026 if (Opc == AtomicRMWInst::Xor) {
32027 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32028 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32029 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32030 if (match(I->user_back(),
32032 return true;
32033 if (match(I->user_back(),
32035 return true;
32036 }
32037 return false;
32038 }
32039
32040 return false;
32041}
32042
32043void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32044 AtomicRMWInst *AI) const {
32045 IRBuilder<> Builder(AI);
32046 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32047 Instruction *TempI = nullptr;
32048 LLVMContext &Ctx = AI->getContext();
32049 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32050 if (!ICI) {
32051 TempI = AI->user_back();
32052 assert(TempI->hasOneUse() && "Must have one use");
32053 ICI = cast<ICmpInst>(TempI->user_back());
32054 }
32056 ICmpInst::Predicate Pred = ICI->getPredicate();
32057 switch (Pred) {
32058 default:
32059 llvm_unreachable("Not supported Pred");
32060 case CmpInst::ICMP_EQ:
32061 CC = X86::COND_E;
32062 break;
32063 case CmpInst::ICMP_NE:
32064 CC = X86::COND_NE;
32065 break;
32066 case CmpInst::ICMP_SLT:
32067 CC = X86::COND_S;
32068 break;
32069 case CmpInst::ICMP_SGT:
32070 CC = X86::COND_NS;
32071 break;
32072 }
32074 switch (AI->getOperation()) {
32075 default:
32076 llvm_unreachable("Unknown atomic operation");
32077 case AtomicRMWInst::Add:
32078 IID = Intrinsic::x86_atomic_add_cc;
32079 break;
32080 case AtomicRMWInst::Sub:
32081 IID = Intrinsic::x86_atomic_sub_cc;
32082 break;
32083 case AtomicRMWInst::Or:
32084 IID = Intrinsic::x86_atomic_or_cc;
32085 break;
32086 case AtomicRMWInst::And:
32087 IID = Intrinsic::x86_atomic_and_cc;
32088 break;
32089 case AtomicRMWInst::Xor:
32090 IID = Intrinsic::x86_atomic_xor_cc;
32091 break;
32092 }
32093 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32095 Value *Call = Builder.CreateIntrinsic(
32096 IID, AI->getType(),
32097 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32098 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32099 ICI->replaceAllUsesWith(Result);
32100 ICI->eraseFromParent();
32101 if (TempI)
32102 TempI->eraseFromParent();
32103 AI->eraseFromParent();
32104}
32105
32107X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32108 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32109 Type *MemType = AI->getType();
32110
32111 // If the operand is too big, we must see if cmpxchg8/16b is available
32112 // and default to library calls otherwise.
32113 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32114 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32116 }
32117
32119 switch (Op) {
32122 case AtomicRMWInst::Add:
32123 case AtomicRMWInst::Sub:
32126 // It's better to use xadd, xsub or xchg for these in other cases.
32128 case AtomicRMWInst::Or:
32129 case AtomicRMWInst::And:
32130 case AtomicRMWInst::Xor:
32133 return shouldExpandLogicAtomicRMWInIR(AI);
32135 case AtomicRMWInst::Max:
32136 case AtomicRMWInst::Min:
32147 default:
32148 // These always require a non-trivial set of data operations on x86. We must
32149 // use a cmpxchg loop.
32151 }
32152}
32153
32154LoadInst *
32155X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32156 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32157 Type *MemType = AI->getType();
32158 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32159 // there is no benefit in turning such RMWs into loads, and it is actually
32160 // harmful as it introduces a mfence.
32161 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32162 return nullptr;
32163
32164 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32165 // lowering available in lowerAtomicArith.
32166 // TODO: push more cases through this path.
32167 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32168 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32169 AI->use_empty())
32170 return nullptr;
32171
32172 IRBuilder<> Builder(AI);
32173 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32174 auto SSID = AI->getSyncScopeID();
32175 // We must restrict the ordering to avoid generating loads with Release or
32176 // ReleaseAcquire orderings.
32178
32179 // Before the load we need a fence. Here is an example lifted from
32180 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32181 // is required:
32182 // Thread 0:
32183 // x.store(1, relaxed);
32184 // r1 = y.fetch_add(0, release);
32185 // Thread 1:
32186 // y.fetch_add(42, acquire);
32187 // r2 = x.load(relaxed);
32188 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32189 // lowered to just a load without a fence. A mfence flushes the store buffer,
32190 // making the optimization clearly correct.
32191 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32192 // otherwise, we might be able to be more aggressive on relaxed idempotent
32193 // rmw. In practice, they do not look useful, so we don't try to be
32194 // especially clever.
32195
32196 // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
32197 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32198 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32199
32200 // Finally we can emit the atomic load.
32201 LoadInst *Loaded = Builder.CreateAlignedLoad(
32202 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32203 Loaded->setAtomic(Order, SSID);
32204 AI->replaceAllUsesWith(Loaded);
32205 AI->eraseFromParent();
32206 return Loaded;
32207}
32208
32209/// Emit a locked operation on a stack location which does not change any
32210/// memory location, but does involve a lock prefix. Location is chosen to be
32211/// a) very likely accessed only by a single thread to minimize cache traffic,
32212/// and b) definitely dereferenceable. Returns the new Chain result.
32214 const X86Subtarget &Subtarget, SDValue Chain,
32215 const SDLoc &DL) {
32216 // Implementation notes:
32217 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32218 // operations issued by the current processor. As such, the location
32219 // referenced is not relevant for the ordering properties of the instruction.
32220 // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
32221 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32222 // 2) Using an immediate operand appears to be the best encoding choice
32223 // here since it doesn't require an extra register.
32224 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32225 // is small enough it might just be measurement noise.)
32226 // 4) When choosing offsets, there are several contributing factors:
32227 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32228 // line aligned stack object to improve this case.)
32229 // b) To minimize our chances of introducing a false dependence, we prefer
32230 // to offset the stack usage from TOS slightly.
32231 // c) To minimize concerns about cross thread stack usage - in particular,
32232 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32233 // captures state in the TOS frame and accesses it from many threads -
32234 // we want to use an offset such that the offset is in a distinct cache
32235 // line from the TOS frame.
32236 //
32237 // For a general discussion of the tradeoffs and benchmark results, see:
32238 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32239
32240 auto &MF = DAG.getMachineFunction();
32241 auto &TFL = *Subtarget.getFrameLowering();
32242 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32243
32244 if (Subtarget.is64Bit()) {
32245 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32246 SDValue Ops[] = {
32247 DAG.getRegister(X86::RSP, MVT::i64), // Base
32248 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32249 DAG.getRegister(0, MVT::i64), // Index
32250 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32251 DAG.getRegister(0, MVT::i16), // Segment.
32252 Zero,
32253 Chain};
32254 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32255 MVT::Other, Ops);
32256 return SDValue(Res, 1);
32257 }
32258
32259 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32260 SDValue Ops[] = {
32261 DAG.getRegister(X86::ESP, MVT::i32), // Base
32262 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32263 DAG.getRegister(0, MVT::i32), // Index
32264 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32265 DAG.getRegister(0, MVT::i16), // Segment.
32266 Zero,
32267 Chain
32268 };
32269 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32270 MVT::Other, Ops);
32271 return SDValue(Res, 1);
32272}
32273
32275 SelectionDAG &DAG) {
32276 SDLoc dl(Op);
32277 AtomicOrdering FenceOrdering =
32278 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32279 SyncScope::ID FenceSSID =
32280 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32281
32282 // The only fence that needs an instruction is a sequentially-consistent
32283 // cross-thread fence.
32284 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32285 FenceSSID == SyncScope::System) {
32286 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32287 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32288
32289 SDValue Chain = Op.getOperand(0);
32290 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32291 }
32292
32293 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32294 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32295}
32296
32298 SelectionDAG &DAG) {
32299 MVT T = Op.getSimpleValueType();
32300 SDLoc DL(Op);
32301 unsigned Reg = 0;
32302 unsigned size = 0;
32303 switch(T.SimpleTy) {
32304 default: llvm_unreachable("Invalid value type!");
32305 case MVT::i8: Reg = X86::AL; size = 1; break;
32306 case MVT::i16: Reg = X86::AX; size = 2; break;
32307 case MVT::i32: Reg = X86::EAX; size = 4; break;
32308 case MVT::i64:
32309 assert(Subtarget.is64Bit() && "Node not type legal!");
32310 Reg = X86::RAX; size = 8;
32311 break;
32312 }
32313 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32314 Op.getOperand(2), SDValue());
32315 SDValue Ops[] = { cpIn.getValue(0),
32316 Op.getOperand(1),
32317 Op.getOperand(3),
32318 DAG.getTargetConstant(size, DL, MVT::i8),
32319 cpIn.getValue(1) };
32320 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32321 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32323 Ops, T, MMO);
32324
32325 SDValue cpOut =
32326 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32327 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32328 MVT::i32, cpOut.getValue(2));
32329 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32330
32331 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32332 cpOut, Success, EFLAGS.getValue(1));
32333}
32334
32335// Create MOVMSKB, taking into account whether we need to split for AVX1.
32337 const X86Subtarget &Subtarget) {
32338 MVT InVT = V.getSimpleValueType();
32339
32340 if (InVT == MVT::v64i8) {
32341 SDValue Lo, Hi;
32342 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32343 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32344 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32345 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32346 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32347 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32348 DAG.getConstant(32, DL, MVT::i8));
32349 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32350 }
32351 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32352 SDValue Lo, Hi;
32353 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32354 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32355 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32356 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32357 DAG.getConstant(16, DL, MVT::i8));
32358 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32359 }
32360
32361 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32362}
32363
/// Custom-lower ISD::BITCAST for the cases X86 marks as Custom:
///  * i64 -> v64i1 on 32-bit BWI targets (split the scalar, concat masks),
///  * v16i1/v32i1 -> scalar integer without AVX512 (sign-extend + MOVMSK),
///  * i64 -> f64 and small vectors -> x86mmx, via a 128-bit XMM round-trip.
/// Any other combination returns SDValue() so generic legalization expands it.
32364 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32365                             SelectionDAG &DAG) {
32366   SDValue Src = Op.getOperand(0);
32367   MVT SrcVT = Src.getSimpleValueType();
32368   MVT DstVT = Op.getSimpleValueType();
32369 
32370   // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32371   // half to v32i1 and concatenating the result.
32372   if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32373     assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32374     assert(Subtarget.hasBWI() && "Expected BWI target");
32375     SDLoc dl(Op);
32376     SDValue Lo, Hi;
32377     std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32378     Lo = DAG.getBitcast(MVT::v32i1, Lo);
32379     Hi = DAG.getBitcast(MVT::v32i1, Hi);
32380     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32381   }
32382 
32383   // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32384   if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32385     assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
    // Sign-extend each i1 to a full byte so MOVMSK picks up the sign bits.
32386     MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32387     SDLoc DL(Op);
32388     SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32389     V = getPMOVMSKB(DL, V, DAG, Subtarget);
32390     return DAG.getZExtOrTrunc(V, DL, DstVT);
32391   }
32392 
32393   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32394           SrcVT == MVT::i64) && "Unexpected VT!");
32395 
32396   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32397   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32398       !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32399     // This conversion needs to be expanded.
32400     return SDValue();
32401 
32402   SDLoc dl(Op);
32403   if (SrcVT.isVector()) {
32404     // Widen the vector in input in the case of MVT::v2i32.
32405     // Example: from MVT::v2i32 to MVT::v4i32.
32407                              SrcVT.getVectorNumElements() * 2);
32408     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32409                       DAG.getUNDEF(SrcVT));
32410   } else {
32411     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32412            "Unexpected source type in LowerBITCAST");
32413     Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32414   }
32415 
  // Bounce through a 128-bit XMM type: either extract the f64 element 0 or
  // convert the low 64 bits down to MMX with MOVDQ2Q.
32416   MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32417   Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32418 
32419   if (DstVT == MVT::x86mmx)
32420     return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32421 
32422   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32423                      DAG.getVectorIdxConstant(0, dl));
32424 }
32425
32426 /// Compute the horizontal sum of bytes in V for the elements of VT.
32427 ///
32428 /// Requires V to be a byte vector and VT to be an integer vector type with
32429 /// wider elements than V's type. The width of the elements of VT determines
32430 /// how many bytes of V are summed horizontally to produce each element of the
32431 /// result.
/// Used by LowerVectorCTPOP to widen per-byte popcounts to i16/i32/i64
/// element counts.
32433                                       const X86Subtarget &Subtarget,
32434                                       SelectionDAG &DAG) {
32435   SDLoc DL(V);
32436   MVT ByteVecVT = V.getSimpleValueType();
32437   MVT EltVT = VT.getVectorElementType();
32438   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32439          "Expected value to have byte element type.");
32440   assert(EltVT != MVT::i8 &&
32441          "Horizontal byte sum only makes sense for wider elements!");
32442   unsigned VecSize = VT.getSizeInBits();
32443   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32444 
32445   // PSADBW instruction horizontally add all bytes and leave the result in i64
32446   // chunks, thus directly computes the pop count for v2i64 and v4i64.
32447   if (EltVT == MVT::i64) {
    // PSADBW against zero sums 8 consecutive bytes into each i64 lane.
32448     SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32449     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32450     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32451     return DAG.getBitcast(VT, V);
32452   }
32453 
32454   if (EltVT == MVT::i32) {
32455     // We unpack the low half and high half into i32s interleaved with zeros so
32456     // that we can use PSADBW to horizontally sum them. The most useful part of
32457     // this is that it lines up the results of two PSADBW instructions to be
32458     // two v2i64 vectors which concatenated are the 4 population counts. We can
32459     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32460     SDValue Zeros = DAG.getConstant(0, DL, VT);
32461     SDValue V32 = DAG.getBitcast(VT, V);
32462     SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32463     SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32464 
32465     // Do the horizontal sums into two v2i64s.
32466     Zeros = DAG.getConstant(0, DL, ByteVecVT);
32467     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32468     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32469                       DAG.getBitcast(ByteVecVT, Low), Zeros);
32470     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32471                        DAG.getBitcast(ByteVecVT, High), Zeros);
32472 
32473     // Merge them together.
32474     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32475     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32476                     DAG.getBitcast(ShortVecVT, Low),
32477                     DAG.getBitcast(ShortVecVT, High));
32478 
32479     return DAG.getBitcast(VT, V);
32480   }
32481 
32482   // The only element type left is i16.
32483   assert(EltVT == MVT::i16 && "Unknown how to handle type");
32484 
32485   // To obtain pop count for each i16 element starting from the pop count for
32486   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32487   // right by 8. It is important to shift as i16s as i8 vector shift isn't
32488   // directly supported.
32489   SDValue ShifterV = DAG.getConstant(8, DL, VT);
32490   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  // After the byte add, the high byte of each i16 holds lo+hi; the SRL moves
  // that sum down into the element's low byte.
32491   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32492                   DAG.getBitcast(ByteVecVT, V));
32493   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32494 }
32495
/// Lower a vXi8 CTPOP via an in-register PSHUFB lookup of per-nibble
/// popcounts (low and high nibble looked up separately, then added).
32497                                         const X86Subtarget &Subtarget,
32498                                         SelectionDAG &DAG) {
32499   MVT VT = Op.getSimpleValueType();
32500   MVT EltVT = VT.getVectorElementType();
32501   int NumElts = VT.getVectorNumElements();
32502   (void)EltVT;
32503   assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32504 
32505   // Implement a lookup table in register by using an algorithm based on:
32506   // http://wm.ite.pl/articles/sse-popcount.html
32507   //
32508   // The general idea is that every lower byte nibble in the input vector is an
32509   // index into a in-register pre-computed pop count table. We then split up the
32510   // input vector in two new ones: (1) a vector with only the shifted-right
32511   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32512   // masked out higher ones) for each byte. PSHUFB is used separately with both
32513   // to index the in-register table. Next, both are added and the result is a
32514   // i8 vector where each element contains the pop count for input byte.
32515   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32516                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32517                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32518                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32519 
  // Repeat the 16-entry table across every 16-byte lane (LUT[i % 16]) so
  // each PSHUFB lane indexes its own copy of the table.
32521   for (int i = 0; i < NumElts; ++i)
32522     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32523   SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32524   SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32525 
32526   // High nibbles
32527   SDValue FourV = DAG.getConstant(4, DL, VT);
32528   SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32529 
32530   // Low nibbles
32531   SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32532 
32533   // The input vector is used as the shuffle mask that index elements into the
32534   // LUT. After counting low and high nibbles, add the vector to obtain the
32535   // final pop count per i8 element.
32536   SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32537   SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32538   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32539 }
32540
32541 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32542 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
// Dispatches vector CTPOP lowering: native VPOPCNT where available, split
// for illegal widths, reduce wide elements to a vXi8 popcount + byte sum,
// and finally the PSHUFB LUT for vXi8 itself.
32544                                 const X86Subtarget &Subtarget,
32545                                 SelectionDAG &DAG) {
32546   MVT VT = Op.getSimpleValueType();
32547   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32548          "Unknown CTPOP type to handle");
32549   SDValue Op0 = Op.getOperand(0);
32550 
32551   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32552   if (Subtarget.hasVPOPCNTDQ()) {
32553     unsigned NumElems = VT.getVectorNumElements();
32554     assert((VT.getVectorElementType() == MVT::i8 ||
32555             VT.getVectorElementType() == MVT::i16) && "Unexpected type");
    // Only widen when the zero-extended vector still fits a legal width.
32556     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32557       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32558       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32559       Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32560       return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32561     }
32562   }
32563 
32564   // Decompose 256-bit ops into smaller 128-bit ops.
32565   if (VT.is256BitVector() && !Subtarget.hasInt256())
32566     return splitVectorIntUnary(Op, DAG, DL);
32567 
32568   // Decompose 512-bit ops into smaller 256-bit ops.
32569   if (VT.is512BitVector() && !Subtarget.hasBWI())
32570     return splitVectorIntUnary(Op, DAG, DL);
32571 
32572   // For element types greater than i8, do vXi8 pop counts and a bytesum.
32573   if (VT.getScalarType() != MVT::i8) {
32574     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32575     SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32576     SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32577     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32578   }
32579 
32580   // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32581   if (!Subtarget.hasSSSE3())
32582     return SDValue();
32583 
32584   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32585 }
32586
/// Custom CTPOP lowering. For scalar integers, use known-bits analysis to
/// shrink the value to its active bit range and handle small ranges with
/// constant-folding, arithmetic identities, or immediate lookup tables;
/// wider ranges fall back to generic expansion. Vectors are forwarded to
/// LowerVectorCTPOP.
32587 static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32588                           SelectionDAG &DAG) {
32589   MVT VT = N.getSimpleValueType();
32590   SDValue Op = N.getOperand(0);
32591   SDLoc DL(N);
32592 
32593   if (VT.isScalarInteger()) {
32594     // Compute the lower/upper bounds of the active bits of the value,
32595     // allowing us to shift the active bits down if necessary to fit into the
32596     // special cases below.
32597     KnownBits Known = DAG.computeKnownBits(Op);
    // Fully-known value: fold the popcount to a constant outright.
32598     if (Known.isConstant())
32599       return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32600     unsigned LZ = Known.countMinLeadingZeros();
32601     unsigned TZ = Known.countMinTrailingZeros();
32602     assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32603     unsigned ActiveBits = Known.getBitWidth() - LZ;
32604     unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32605 
32606     // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32607     if (ShiftedActiveBits <= 2) {
      // Shift the active bits down to bit 0 first if they sit higher up.
32608       if (ActiveBits > 2)
32609         Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32610                          DAG.getShiftAmountConstant(TZ, VT, DL));
32611       Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32612       Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32613                        DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32614                                    DAG.getShiftAmountConstant(1, VT, DL)));
32615       return DAG.getZExtOrTrunc(Op, DL, VT);
32616     }
32617 
32618     // i3 CTPOP - perform LUT into i32 integer.
32619     if (ShiftedActiveBits <= 3) {
32620       if (ActiveBits > 3)
32621         Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32622                          DAG.getShiftAmountConstant(TZ, VT, DL));
32623       Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      // 0b1110100110010100 packs popcount(0..7) as 2-bit fields; index the
      // table with 2*x (the SHL by 1) and mask the 2-bit result.
32624       Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32625                        DAG.getShiftAmountConstant(1, VT, DL));
32626       Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32627                        DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32628       Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32629                        DAG.getConstant(0x3, DL, MVT::i32));
32630       return DAG.getZExtOrTrunc(Op, DL, VT);
32631     }
32632 
32633     // i4 CTPOP - perform LUT into i64 integer.
32634     if (ShiftedActiveBits <= 4 &&
32635         DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
      // 0x4332322132212110 packs popcount(0..15) as 4-bit nibbles; index
      // with 4*x and mask to 3 bits (max popcount is 4, fits in 3 bits).
32636       SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32637       if (ActiveBits > 4)
32638         Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32639                          DAG.getShiftAmountConstant(TZ, VT, DL));
32640       Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32641       Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32642                        DAG.getConstant(4, DL, MVT::i32));
32643       Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32644                        DAG.getShiftAmountOperand(MVT::i64, Op));
32645       Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32646                        DAG.getConstant(0x7, DL, MVT::i64));
32647       return DAG.getZExtOrTrunc(Op, DL, VT);
32648     }
32649 
32650     // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
32651     if (ShiftedActiveBits <= 8) {
32652       SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32653       if (ActiveBits > 8)
32654         Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32655                          DAG.getShiftAmountConstant(TZ, VT, DL));
32656       Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      // MUL by 0x08040201 + >>3 + AND 0x11111111 scatters one source bit
      // into each nibble; the second MUL by 0x11111111 sums the nibbles
      // into the top nibble, extracted by the final >>28.
32657       Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32658                        DAG.getConstant(0x08040201U, DL, MVT::i32));
32659       Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32660                        DAG.getShiftAmountConstant(3, MVT::i32, DL));
32661       Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32662       Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32663       Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32664                        DAG.getShiftAmountConstant(28, MVT::i32, DL));
32665       return DAG.getZExtOrTrunc(Op, DL, VT);
32666     }
32667 
32668     return SDValue(); // fallback to generic expansion.
32669   }
32670 
32671   assert(VT.isVector() &&
32672          "We only do custom lowering for vector population count.");
32673   return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32674 }
32675
/// Lower BITREVERSE on XOP targets using VPPERM, whose per-byte control
/// value (2 << 5) reverses the bits of the selected source byte; the byte
/// swap needed for multi-byte elements is folded into the same shuffle mask.
32677   MVT VT = Op.getSimpleValueType();
32678   SDValue In = Op.getOperand(0);
32679   SDLoc DL(Op);
32680 
32681   // For scalars, its still beneficial to transfer to/from the SIMD unit to
32682   // perform the BITREVERSE.
32683   if (!VT.isVector()) {
32684     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32685     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32686     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32687     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32688                        DAG.getVectorIdxConstant(0, DL));
32689   }
32690 
32691   int NumElts = VT.getVectorNumElements();
32692   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32693 
32694   // Decompose 256-bit ops into smaller 128-bit ops.
32695   if (VT.is256BitVector())
32696     return splitVectorIntUnary(Op, DAG, DL);
32697 
32698   assert(VT.is128BitVector() &&
32699          "Only 128-bit vector bitreverse lowering supported.");
32700 
32701   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32702   // perform the BSWAP in the shuffle.
32703   // Its best to shuffle using the second operand as this will implicitly allow
32704   // memory folding for multiple vectors.
32705   SmallVector<SDValue, 16> MaskElts;
32706   for (int i = 0; i != NumElts; ++i) {
    // Walk each element's bytes in reverse (j counts down) so the mask
    // byte-swaps while the (2 << 5) control bit-reverses each byte.
32707     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      // +16 selects from VPPERM's second source operand.
32708       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32709       int PermuteByte = SourceByte | (2 << 5);
32710       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32711     }
32712   }
32713 
32714   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32715   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32716   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32717                     Res, Mask);
32718   return DAG.getBitcast(VT, Res);
32719 }
32720
/// Generic BITREVERSE lowering: prefer XOP's VPPERM, otherwise reduce
/// everything to a vXi8 bit-reverse (scalars bounce through an XMM register,
/// wider elements BSWAP first), done with either GFNI's GF2P8AFFINEQB or a
/// pair of PSHUFB nibble lookups.
32722                                SelectionDAG &DAG) {
32723   MVT VT = Op.getSimpleValueType();
32724 
32725   if (Subtarget.hasXOP() && !VT.is512BitVector())
32726     return LowerBITREVERSE_XOP(Op, DAG);
32727 
32728   assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32729          "SSSE3 or GFNI required for BITREVERSE");
32730 
32731   SDValue In = Op.getOperand(0);
32732   SDLoc DL(Op);
32733 
32734   // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32735   if (VT.is512BitVector() && !Subtarget.hasBWI())
32736     return splitVectorIntUnary(Op, DAG, DL);
32737 
32738   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32739   if (VT.is256BitVector() && !Subtarget.hasInt256())
32740     return splitVectorIntUnary(Op, DAG, DL);
32741 
32742   // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32743   if (!VT.isVector()) {
32744     assert(
32745         (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32746         "Only tested for i8/i16/i32/i64");
32747     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32748     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32749     Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32750                       DAG.getBitcast(MVT::v16i8, Res));
32751     Res =
32752         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32753                     DAG.getVectorIdxConstant(0, DL));
    // The per-byte reverse leaves the bytes themselves in the wrong order
    // for multi-byte scalars, so finish with a BSWAP (no-op for i8).
32754     return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32755   }
32756 
32757   assert(VT.isVector() && VT.getSizeInBits() >= 128);
32758 
32759   // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32760   if (VT.getScalarType() != MVT::i8) {
32761     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32762     SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32763     Res = DAG.getBitcast(ByteVT, Res);
32764     Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32765     return DAG.getBitcast(VT, Res);
32766   }
32767   assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32768          "Only byte vector BITREVERSE supported");
32769 
32770   unsigned NumElts = VT.getVectorNumElements();
32771 
32772   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32773   if (Subtarget.hasGFNI()) {
32775     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32776                        DAG.getTargetConstant(0, DL, MVT::i8));
32777   }
32778 
32779   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32780   // two nibbles and a PSHUFB lookup to find the bitreverse of each
32781   // 0-15 value (moved to the other nibble).
32782   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32783   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32784   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32785 
  // LoLUT reverses a low nibble into the high nibble position; HiLUT
  // reverses a high nibble into the low nibble position.
32786   const int LoLUT[16] = {
32787       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32788       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32789       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32790       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32791   const int HiLUT[16] = {
32792       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32793       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32794       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32795       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32796 
32797   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32798   for (unsigned i = 0; i < NumElts; ++i) {
32799     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32800     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32801   }
32802 
32803   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32804   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32805   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32806   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32807   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32808 }
32809
/// Lower ISD::PARITY by XOR-folding the input down to a single byte and
/// reading the inverted hardware parity flag (PF) with SETNP. Inputs already
/// known to fit in 8 bits can use a single TEST instead.
32810 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32811                            SelectionDAG &DAG) {
32812   SDLoc DL(Op);
32813   SDValue X = Op.getOperand(0);
32814   MVT VT = Op.getSimpleValueType();
32815 
32816   // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32817   if (VT == MVT::i8 ||
32819     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32820     SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32821                                 DAG.getConstant(0, DL, MVT::i8));
32822     // Copy the inverse of the parity flag into a register with setcc.
32823     SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32824     // Extend to the original type.
32825     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32826   }
32827 
32828   // If we have POPCNT, use the default expansion.
32829   if (Subtarget.hasPOPCNT())
32830     return SDValue();
32831 
32832   if (VT == MVT::i64) {
32833     // Xor the high and low 32-bits together using a 32-bit operation.
32834     SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32835                              DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32836                                          DAG.getConstant(32, DL, MVT::i8)));
32837     SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32838     X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32839   }
32840 
32841   if (VT != MVT::i16) {
32842     // Xor the high and low 16-bits together using a 32-bit operation.
32843     SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32844                                DAG.getConstant(16, DL, MVT::i8));
32845     X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32846   } else {
32847     // If the input is 16-bits, we need to extend to use an i32 shift below.
32848     X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32849   }
32850 
32851   // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
32852   // This should allow an h-reg to be used to save a shift.
32853   SDValue Hi = DAG.getNode(
32854       ISD::TRUNCATE, DL, MVT::i8,
32855       DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32856   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32857   SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32858   SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32859 
32860   // Copy the inverse of the parity flag into a register with setcc.
32861   SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32862   // Extend to the original type.
32863   return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32864 }
32865
/// Rewrite an ATOMIC_LOAD_<op> node as the matching X86 LOCK-prefixed,
/// flag-producing node (LADD/LSUB/LOR/LXOR/LAND), preserving the original
/// memory operand. Only valid when the loaded result is unused (checked by
/// the caller, lowerAtomicArith).
32867                                       const X86Subtarget &Subtarget) {
32868   unsigned NewOpc = 0;
32869   switch (N->getOpcode()) {
32871     NewOpc = X86ISD::LADD;
32872     break;
32874     NewOpc = X86ISD::LSUB;
32875     break;
32877     NewOpc = X86ISD::LOR;
32878     break;
32880     NewOpc = X86ISD::LXOR;
32881     break;
32883     NewOpc = X86ISD::LAND;
32884     break;
32885   default:
32886     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32887   }
32888 
32889   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32890 
  // Result 0 is EFLAGS (i32), result 1 is the chain; operands are
  // chain, pointer, value.
32891   return DAG.getMemIntrinsicNode(
32892       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32893       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32894       /*MemVT=*/N->getSimpleValueType(0), MMO);
32895 }
32896
32897 /// Lower atomic_load_ops into LOCK-prefixed operations.
///
/// Three cases:
///  * result used: only ADD-like forms survive (SUB/XOR-of-signbit are
///    rewritten as ADD of the negation so LXADD can be selected);
///  * idempotent op (or-with-0), non-volatile: only the ordering matters,
///    so emit a locked stack op (seq_cst) or a compiler-only MEMBARRIER;
///  * otherwise: a LOCK-prefixed RMW via lowerAtomicArithWithLOCK.
32899                                  const X86Subtarget &Subtarget) {
32900   AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32901   SDValue Chain = N->getOperand(0);
32902   SDValue LHS = N->getOperand(1);
32903   SDValue RHS = N->getOperand(2);
32904   unsigned Opc = N->getOpcode();
32905   MVT VT = N->getSimpleValueType(0);
32906   SDLoc DL(N);
32907 
32908   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32909   // can only be lowered when the result is unused. They should have already
32910   // been transformed into a cmpxchg loop in AtomicExpand.
32911   if (N->hasAnyUseOfValue(0)) {
32912     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32913     // select LXADD if LOCK_SUB can't be selected.
32914     // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32915     // can use LXADD as opposed to cmpxchg.
32916     if (Opc == ISD::ATOMIC_LOAD_SUB ||
32918       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32919                            DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32920 
32922            "Used AtomicRMW ops other than Add should have been expanded!");
32923     return N;
32924   }
32925 
32926   // Specialized lowering for the canonical form of an idempotent atomicrmw.
32927   // The core idea here is that since the memory location isn't actually
32928   // changing, all we need is a lowering for the *ordering* impacts of the
32929   // atomicrmw. As such, we can choose a different operation and memory
32930   // location to minimize impact on other code.
32931   // The above holds unless the node is marked volatile in which
32932   // case it needs to be preserved according to the langref.
32933   if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32934     // On X86, the only ordering which actually requires an instruction is
32935     // seq_cst which isn't SingleThread, everything just needs to be preserved
32936     // during codegen and then dropped. Note that we expect (but don't assume),
32937     // that orderings other than seq_cst and acq_rel have been canonicalized to
32938     // a store or load.
32941       // Prefer a locked operation against a stack location to minimize cache
32942       // traffic. This assumes that stack locations are very likely to be
32943       // accessed only by the owning thread.
32944       SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32945       assert(!N->hasAnyUseOfValue(0));
32946       // NOTE: The getUNDEF is needed to give something for the unused result 0.
32947       return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32948                          DAG.getUNDEF(VT), NewChain);
32949     }
32950     // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32951     SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32952     assert(!N->hasAnyUseOfValue(0));
32953     // NOTE: The getUNDEF is needed to give something for the unused result 0.
32954     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32955                        DAG.getUNDEF(VT), NewChain);
32956   }
32957 
32958   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32959   // RAUW the chain, but don't worry about the result, as it's unused.
32960   assert(!N->hasAnyUseOfValue(0));
32961   // NOTE: The getUNDEF is needed to give something for the unused result 0.
32962   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32963                      DAG.getUNDEF(VT), LockOp.getValue(1));
32964 }
32965
/// Lower an atomic store. Legal, non-seq_cst stores are kept as-is. Illegal
/// widths try FP/vector moves when allowed (i128 via AVX v2i64 store; i64 on
/// 32-bit via SSE MOVQ/MOVLPS or an X87 FILD/FIST pair), adding a locked
/// stack op as the fence for seq_cst. Everything else becomes an XCHG.
32967                                  const X86Subtarget &Subtarget) {
32968   auto *Node = cast<AtomicSDNode>(Op.getNode());
32969   SDLoc dl(Node);
32970   EVT VT = Node->getMemoryVT();
32971 
32972   bool IsSeqCst =
32973       Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32974   bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32975 
32976   // If this store is not sequentially consistent and the type is legal
32977   // we can just keep it.
32978   if (!IsSeqCst && IsTypeLegal)
32979     return Op;
32980 
32981   if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32983           Attribute::NoImplicitFloat)) {
32984     SDValue Chain;
32985     // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32986     // vector store.
32987     if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32988       SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32989       Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32990                            Node->getMemOperand());
32991     }
32992 
32993     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32994     // is enabled.
32995     if (VT == MVT::i64) {
32996       if (Subtarget.hasSSE1()) {
32997         SDValue SclToVec =
32998             DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
        // Without SSE2 there is no v2i64; MOVLPS on v4f32 stores the same
        // low 64 bits.
32999         MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33000         SclToVec = DAG.getBitcast(StVT, SclToVec);
33001         SDVTList Tys = DAG.getVTList(MVT::Other);
33002         SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33003         Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33004                                         MVT::i64, Node->getMemOperand());
33005       } else if (Subtarget.hasX87()) {
33006         // First load this into an 80-bit X87 register using a stack temporary.
33007         // This will put the whole integer into the significand.
33008         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33009         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33010         MachinePointerInfo MPI =
33012         Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33014         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33015         SDValue LdOps[] = {Chain, StackPtr};
33017             X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33018             /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33019         Chain = Value.getValue(1);
33020 
33021         // Now use an FIST to do the atomic store.
33022         SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33023         Chain =
33024             DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33025                                     StoreOps, MVT::i64, Node->getMemOperand());
33026       }
33027     }
33028 
33029     if (Chain) {
33030       // If this is a sequentially consistent store, also emit an appropriate
33031       // barrier.
33032       if (IsSeqCst)
33033         Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33034 
33035       return Chain;
33036     }
33037   }
33038 
33039   // Convert seq_cst store -> xchg
33040   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33041   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33042   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33043                                Node->getOperand(0), Node->getOperand(2),
33044                                Node->getOperand(1), Node->getMemOperand());
33045   return Swap.getValue(1);
33046 }
33047
/// Lower [US]ADDO_CARRY / [US]SUBO_CARRY to X86 ADC/SBB, producing both the
/// sum/difference and the outgoing overflow/carry flag.
33049   SDNode *N = Op.getNode();
33050   MVT VT = N->getSimpleValueType(0);
33051   unsigned Opc = Op.getOpcode();
33052 
33053   // Let legalize expand this if it isn't a legal type yet.
33054   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33055     return SDValue();
33056 
33057   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33058   SDLoc DL(N);
33059 
33060   // Set the carry flag.
  // Adding all-ones to the incoming carry value sets EFLAGS.CF exactly when
  // that value is non-zero, rematerializing the carry into the flags.
33061   SDValue Carry = Op.getOperand(2);
33062   EVT CarryVT = Carry.getValueType();
33063   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33064                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
33065 
33066   bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33067   SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33068                             Op.getOperand(0), Op.getOperand(1),
33069                             Carry.getValue(1));
33070 
  // Signed variants report overflow (OF); unsigned variants report carry (CF).
33071   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33072   SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33073                            Sum.getValue(1), DL, DAG);
33074   if (N->getValueType(1) == MVT::i1)
33075     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33076 
33077   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33078 }
33079
/// Lower FSINCOS on 64-bit Darwin by calling __sincos_stret, which returns
/// both results in registers; the pair is then unpacked into the node's two
/// results.
33080 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33081                             SelectionDAG &DAG) {
33082   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33083 
33084   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33085   // which returns the values as { float, float } (in XMM0) or
33086   // { double, double } (which is returned in XMM0, XMM1).
33087   SDLoc dl(Op);
33088   SDValue Arg = Op.getOperand(0);
33089   EVT ArgVT = Arg.getValueType();
33090   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33091 
33093   Args.emplace_back(Arg, ArgTy);
33094 
33095   bool isF64 = ArgVT == MVT::f64;
33096   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33097   // the small struct {f32, f32} is returned in (eax, edx). For f64,
33098   // the results are returned via SRet in memory.
33099   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33100   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33101   const char *LibcallName = TLI.getLibcallName(LC);
33102   SDValue Callee =
33103       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33104 
  // f64 returns a two-element struct; f32 is modelled as a <4 x float>
  // return so both values land in XMM0.
33105   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33106                       : (Type *)FixedVectorType::get(ArgTy, 4);
33107 
33109   CLI.setDebugLoc(dl)
33110       .setChain(DAG.getEntryNode())
33111       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33112 
33113   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33114 
33115   if (isF64)
33116     // Returned in xmm0 and xmm1.
33117     return CallResult.first;
33118 
33119   // Returned in bits [0,31] and [32,63] of xmm0.
33120   SDValue SinVal =
33121       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33122                   DAG.getVectorIdxConstant(0, dl));
33123   SDValue CosVal =
33124       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33125                   DAG.getVectorIdxConstant(1, dl));
33126   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33127   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33128 }
33129
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc dl(InOp);
  // Peel off a trailing undef half (or all-zero half when zero-filling anyway)
  // of a two-operand concat; the widening below re-adds the fill.
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
    // Widen by appending per-element fill values to the existing operands.
    EVT EltVT = InOp.getOperand(0).getValueType();
    SDValue FillVal =
        FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
    SmallVector<SDValue, 16> Ops(InOp->ops());
    Ops.append(WidenNumElts - InNumElts, FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  // General case: insert InOp at index 0 of a wide zero/undef vector.
  SDValue FillVal =
      FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
                     DAG.getVectorIdxConstant(0, dl));
}
33174
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue Scale = N->getScale();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();

  // v2f32/v2i32 data is only handled directly when VLX allows xmm registers
  // for both the data and a v2i64 index; otherwise defer to the default
  // legalization.
  if (VT == MVT::v2f32 || VT == MVT::v2i32) {
    assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
    // If the index is v2i64 and we have VLX we can use xmm for data and index.
    if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
      // Pad the two data elements up to the legalized width with undef.
      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
      SDVTList VTs = DAG.getVTList(MVT::Other);
      SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
      return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
                                     N->getMemoryVT(), N->getMemOperand());
    }
    return SDValue();
  }

  MVT IndexVT = Index.getSimpleValueType();

  // If the index is v2i32, we're being called by type legalization and we
  // should just let the default handling take care of it.
  if (IndexVT == MVT::v2i32)
    return SDValue();

  // If we don't have VLX and neither the passthru or index is 512-bits, we
  // need to widen until one is.
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // Determine how much we need to widen by to get a 512-bit type.
    unsigned Factor = std::min(512/VT.getSizeInBits(),
                               512/IndexVT.getSizeInBits());
    unsigned NumElts = VT.getVectorNumElements() * Factor;

    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);

    // The mask is widened with zeros, so the extra lanes store nothing.
    Src = ExtendToType(Src, VT, DAG);
    Index = ExtendToType(Index, IndexVT, DAG);
    Mask = ExtendToType(Mask, MaskVT, DAG, true);
  }

  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
  return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
                                 N->getMemoryVT(), N->getMemOperand());
}
33237
33238static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33239 SelectionDAG &DAG) {
33240
33241 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33242 MVT VT = Op.getSimpleValueType();
33243 MVT ScalarVT = VT.getScalarType();
33244 SDValue Mask = N->getMask();
33245 MVT MaskVT = Mask.getSimpleValueType();
33246 SDValue PassThru = N->getPassThru();
33247 SDLoc dl(Op);
33248
33249 // Handle AVX masked loads which don't support passthru other than 0.
33250 if (MaskVT.getVectorElementType() != MVT::i1) {
33251 // We also allow undef in the isel pattern.
33252 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33253 return Op;
33254
33255 SDValue NewLoad = DAG.getMaskedLoad(
33256 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33257 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33258 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33259 N->isExpandingLoad());
33260 // Emit a blend.
33261 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33262 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33263 }
33264
33265 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33266 "Expanding masked load is supported on AVX-512 target only!");
33267
33268 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33269 "Expanding masked load is supported for 32 and 64-bit types only!");
33270
33271 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33272 "Cannot lower masked load op.");
33273
33274 assert((ScalarVT.getSizeInBits() >= 32 ||
33275 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33276 ScalarVT == MVT::f16))) &&
33277 "Unsupported masked load op.");
33278
33279 // This operation is legal for targets with VLX, but without
33280 // VLX the vector should be widened to 512 bit
33281 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33282 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33283 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33284
33285 // Mask element has to be i1.
33286 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33287 "Unexpected mask type");
33288
33289 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33290
33291 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33292 SDValue NewLoad = DAG.getMaskedLoad(
33293 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33294 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33295 N->getExtensionType(), N->isExpandingLoad());
33296
33297 SDValue Extract =
33298 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33299 DAG.getVectorIdxConstant(0, dl));
33300 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33301 return DAG.getMergeValues(RetOps, dl);
33302}
33303
33304static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33305 SelectionDAG &DAG) {
33306 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33307 SDValue DataToStore = N->getValue();
33308 MVT VT = DataToStore.getSimpleValueType();
33309 MVT ScalarVT = VT.getScalarType();
33310 SDValue Mask = N->getMask();
33311 SDLoc dl(Op);
33312
33313 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33314 "Expanding masked load is supported on AVX-512 target only!");
33315
33316 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33317 "Expanding masked load is supported for 32 and 64-bit types only!");
33318
33319 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33320 "Cannot lower masked store op.");
33321
33322 assert((ScalarVT.getSizeInBits() >= 32 ||
33323 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33324 ScalarVT == MVT::f16))) &&
33325 "Unsupported masked store op.");
33326
33327 // This operation is legal for targets with VLX, but without
33328 // VLX the vector should be widened to 512 bit
33329 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33330 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33331
33332 // Mask element has to be i1.
33333 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33334 "Unexpected mask type");
33335
33336 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33337
33338 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33339 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33340 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33341 N->getOffset(), Mask, N->getMemoryVT(),
33342 N->getMemOperand(), N->getAddressingMode(),
33343 N->isTruncatingStore(), N->isCompressingStore());
33344}
33345
33346static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33347 SelectionDAG &DAG) {
33348 assert(Subtarget.hasAVX2() &&
33349 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33350
33351 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33352 SDLoc dl(Op);
33353 MVT VT = Op.getSimpleValueType();
33354 SDValue Index = N->getIndex();
33355 SDValue Mask = N->getMask();
33356 SDValue PassThru = N->getPassThru();
33357 MVT IndexVT = Index.getSimpleValueType();
33358
33359 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33360
33361 // If the index is v2i32, we're being called by type legalization.
33362 if (IndexVT == MVT::v2i32)
33363 return SDValue();
33364
33365 // If we don't have VLX and neither the passthru or index is 512-bits, we
33366 // need to widen until one is.
33367 MVT OrigVT = VT;
33368 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33369 !IndexVT.is512BitVector()) {
33370 // Determine how much we need to widen by to get a 512-bit type.
33371 unsigned Factor = std::min(512/VT.getSizeInBits(),
33372 512/IndexVT.getSizeInBits());
33373
33374 unsigned NumElts = VT.getVectorNumElements() * Factor;
33375
33376 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33377 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33378 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33379
33380 PassThru = ExtendToType(PassThru, VT, DAG);
33381 Index = ExtendToType(Index, IndexVT, DAG);
33382 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33383 }
33384
33385 // Break dependency on the data register.
33386 if (PassThru.isUndef())
33387 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33388
33389 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33390 N->getScale() };
33391 SDValue NewGather = DAG.getMemIntrinsicNode(
33392 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33393 N->getMemOperand());
33394 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33395 DAG.getVectorIdxConstant(0, dl));
33396 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33397}
33398
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  MVT DstVT = Op.getSimpleValueType();

  AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
  unsigned SrcAS = N->getSrcAddressSpace();

  assert(SrcAS != N->getDestAddressSpace() &&
         "addrspacecast must be between different address spaces");

  // 32-bit "unsigned" pointers (PTR32_UPTR) zero-extend when widened to 64
  // bits; any other 32->64 widening sign-extends, and 64->32 truncates.
  if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
  } else if (DstVT == MVT::i64) {
    Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
  } else if (DstVT == MVT::i32) {
    Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
  } else {
    report_fatal_error("Bad address space in addrspacecast");
  }
  return Op;
}
33421
SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
                                              SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  Ops.push_back(Op.getOperand(0)); // The incoming chain.
  if (Op->getGluedNode())
    // Preserve a trailing glue operand, if present.
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
}
33438
// Custom split CVTPS2PH with wide types.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  // Split the input in half and convert each half separately.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  // Operand 1 (named RC) is forwarded unchanged to both halves — presumably
  // the rounding-control immediate; confirm against X86ISD::CVTPS2PH's
  // operand layout.
  SDValue RC = Op.getOperand(1);
  Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
  Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
33452
                             SelectionDAG &DAG) {
  // Operand 4 of ISD::PREFETCH distinguishes data (non-zero) from
  // instruction (zero) prefetches.
  unsigned IsData = Op.getConstantOperandVal(4);

  // We don't support non-data prefetch without PREFETCHI.
  // Just preserve the chain.
  if (!IsData && !Subtarget.hasPREFETCHI())
    return Op.getOperand(0);

  return Op;
}
33464
  SDNode *N = Op.getNode();
  SDValue Operand = N->getOperand(0);
  EVT VT = Operand.getValueType();
  SDLoc dl(N);

  // Canonicalize by multiplying with 1.0 via a strict FP multiply, so the
  // multiply is not folded away before it can canonicalize the value.
  SDValue One = DAG.getConstantFP(1.0, dl, VT);

  // TODO: Fix Crash for bf16 when generating strict_fmul as it
  // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
  // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
  // promote this operator's result!
  SDValue Chain = DAG.getEntryNode();
  SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
                                   {Chain, Operand, One});
  return StrictFmul;
}
33482
                                      unsigned OpNo) {
  // Render OpNo as a decimal string, e.g. 1 -> "1".
  const APInt Operand(32, OpNo);
  std::string OpNoStr = llvm::toString(Operand, 10, false);
  std::string Str(" $");

  std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
  std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}

  auto I = StringRef::npos;
  for (auto &AsmStr : AsmStrs) {
    // Match the OpNo string. We should match exactly to exclude match
    // sub-string, e.g. "$12" contain "$1"
    if (AsmStr.ends_with(OpNoStr1))
      I = AsmStr.size() - OpNoStr1.size();

    // Get the index of operand in AsmStr.
    if (I == StringRef::npos)
      I = AsmStr.find(OpNoStr1 + ",");
    if (I == StringRef::npos)
      I = AsmStr.find(OpNoStr2);

    // This asm string does not reference operand OpNo; try the next one.
    if (I == StringRef::npos)
      continue;

    assert(I > 0 && "Unexpected inline asm string!");
    // Remove the operand string and label (if exist).
    // For example:
    // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
    // ==>
    // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
    // ==>
    // "call dword ptr "
    auto TmpStr = AsmStr.substr(0, I);
    I = TmpStr.rfind(':');
    if (I != StringRef::npos)
      TmpStr = TmpStr.substr(I + 1);
    // Return the leading run of alphabetic characters — the mnemonic.
    return TmpStr.take_while(llvm::isAlpha);
  }

  // No asm string mentioned this operand.
  return StringRef();
}
33525
    const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
  // In a __asm block, __asm inst foo where inst is CALL or JMP should be
  // changed from indirect TargetLowering::C_Memory to direct
  // TargetLowering::C_Address.
  // We don't need to special case LOOP* and Jcc, which cannot target a memory
  // location.
  // Extract the mnemonic that uses operand OpNo and test for call/jmp.
  StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
  return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
}
33536
                                      SDValue Mask) {
  EVT Ty = MVT::i8;
  // Reduce the mask to an i8 scalar; assumes Mask is a single-element i1
  // vector (the bitcast to MVT::i1) — confirm at callers.
  auto V = DAG.getBitcast(MVT::i1, Mask);
  auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
  auto Zero = DAG.getConstant(0, DL, Ty);
  SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
  // Emit 0 - mask and return only the EFLAGS result (value #1); ZF is set
  // exactly when the mask is zero.
  auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
  return SDValue(CmpZero.getNode(), 1);
}
33547
    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
    SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
  // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
  // ->
  // _, flags = SUB 0, mask
  // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
  // bit_cast_to_vector<res>
  EVT VTy = PassThru.getValueType();
  EVT Ty = VTy.getVectorElementType();
  SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
  // An undef passthru becomes a zero scalar so the conditional load has a
  // defined value when the condition is false.
  auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
                                           : DAG.getBitcast(Ty, PassThru);
  auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
  // Load only when the mask is non-zero: COND_NE against the 0 - mask flags.
  auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
  SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
  // NewLoad is an out-parameter so the caller can use the chain result.
  NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
  return DAG.getBitcast(VTy, NewLoad);
}
33567
                                            SDValue Chain,
                                            SDValue Val, SDValue Mask) const {
  // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
  // ->
  // _, flags = SUB 0, mask
  // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
  SDVTList Tys = DAG.getVTList(MVT::Other);
  // Store the value as a scalar; assumes Val is a single-element (v1*)
  // vector — see the intrinsic pattern above.
  auto ScalarVal = DAG.getBitcast(Ty, Val);
  auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
  // Store only when the mask is non-zero: COND_NE against the 0 - mask flags.
  auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
  SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
  return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
}
33584
/// Provide custom lowering hooks for some operations.
/// Every case dispatches to a dedicated Lower*/lower* helper; reaching the
/// default case means an opcode was marked Custom without a handler here.
  switch (Op.getOpcode()) {
  // clang-format off
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
  case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
  case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
  case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
  case ISD::VSELECT: return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
  case ISD::FSHL:
  case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
  case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::FP16_TO_FP:
  case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16:
  case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
  case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
  case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
  case ISD::FADD:
  case ISD::FSUB: return lowerFaddFsub(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FABS:
  case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
  case ISD::LRINT:
  case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
  case ISD::SETCC:
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
  case ISD::VASTART: return LowerVASTART(Op, DAG);
  case ISD::VAARG: return LowerVAARG(Op, DAG);
  case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
  case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
  case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
  case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
  case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO: return LowerXALUO(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
  case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::SADDO_CARRY:
  case ISD::SSUBO_CARRY:
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
  case ISD::UADDSAT:
  case ISD::SADDSAT:
  case ISD::USUBSAT:
  case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
  case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
  case ISD::ABDS:
  case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
  case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
  case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
  case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
  case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
  case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
  // clang-format on
  }
}
33747
33748/// Replace a node with an illegal result type with a new node built out of
33749/// custom code.
33752 SelectionDAG &DAG) const {
33753 SDLoc dl(N);
33754 unsigned Opc = N->getOpcode();
33755 switch (Opc) {
33756 default:
33757#ifndef NDEBUG
33758 dbgs() << "ReplaceNodeResults: ";
33759 N->dump(&DAG);
33760#endif
33761 llvm_unreachable("Do not know how to custom type legalize this operation!");
33762 case X86ISD::CVTPH2PS: {
33763 EVT VT = N->getValueType(0);
33764 SDValue Lo, Hi;
33765 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33766 EVT LoVT, HiVT;
33767 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33768 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33769 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33770 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33771 Results.push_back(Res);
33772 return;
33773 }
33775 EVT VT = N->getValueType(0);
33776 SDValue Lo, Hi;
33777 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33778 EVT LoVT, HiVT;
33779 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33780 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33781 {N->getOperand(0), Lo});
33782 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33783 {N->getOperand(0), Hi});
33784 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33785 Lo.getValue(1), Hi.getValue(1));
33786 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33787 Results.push_back(Res);
33788 Results.push_back(Chain);
33789 return;
33790 }
33791 case X86ISD::CVTPS2PH:
33792 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33793 return;
33794 case ISD::CTPOP: {
33795 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33796 // If we have at most 32 active bits, then perform as i32 CTPOP.
33797 // TODO: Perform this in generic legalizer?
33798 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33799 unsigned LZ = Known.countMinLeadingZeros();
33800 unsigned TZ = Known.countMinTrailingZeros();
33801 if ((LZ + TZ) >= 32) {
33802 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33803 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33804 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33805 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33806 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33807 Results.push_back(Op);
33808 return;
33809 }
33810 // Use a v2i64 if possible.
33811 bool NoImplicitFloatOps =
33813 Attribute::NoImplicitFloat);
33814 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33815 SDValue Wide =
33816 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33817 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33818 // Bit count should fit in 32-bits, extract it as that and then zero
33819 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33820 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33821 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33822 DAG.getVectorIdxConstant(0, dl));
33823 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33824 Results.push_back(Wide);
33825 }
33826 return;
33827 }
33828 case ISD::MUL: {
33829 EVT VT = N->getValueType(0);
33831 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33832 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33833 // elements are needed.
33834 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33835 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33836 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33837 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33838 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33839 unsigned NumConcats = 16 / VT.getVectorNumElements();
33840 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33841 ConcatOps[0] = Res;
33842 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33843 Results.push_back(Res);
33844 return;
33845 }
33846 case ISD::SMULO:
33847 case ISD::UMULO: {
33848 EVT VT = N->getValueType(0);
33850 VT == MVT::v2i32 && "Unexpected VT!");
33851 bool IsSigned = Opc == ISD::SMULO;
33852 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33853 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33854 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33855 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33856 // Extract the high 32 bits from each result using PSHUFD.
33857 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33858 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33859 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33860 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33861 DAG.getVectorIdxConstant(0, dl));
33862
33863 // Truncate the low bits of the result. This will become PSHUFD.
33864 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33865
33866 SDValue HiCmp;
33867 if (IsSigned) {
33868 // SMULO overflows if the high bits don't match the sign of the low.
33869 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33870 } else {
33871 // UMULO overflows if the high bits are non-zero.
33872 HiCmp = DAG.getConstant(0, dl, VT);
33873 }
33874 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33875
33876 // Widen the result with by padding with undef.
33877 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33878 DAG.getUNDEF(VT));
33879 Results.push_back(Res);
33880 Results.push_back(Ovf);
33881 return;
33882 }
33883 case X86ISD::VPMADDWD: {
33884 // Legalize types for X86ISD::VPMADDWD by widening.
33885 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33886
33887 EVT VT = N->getValueType(0);
33888 EVT InVT = N->getOperand(0).getValueType();
33889 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33890 "Expected a VT that divides into 128 bits.");
33892 "Unexpected type action!");
33893 unsigned NumConcat = 128 / InVT.getSizeInBits();
33894
33895 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33896 InVT.getVectorElementType(),
33897 NumConcat * InVT.getVectorNumElements());
33898 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33900 NumConcat * VT.getVectorNumElements());
33901
33902 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33903 Ops[0] = N->getOperand(0);
33904 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33905 Ops[0] = N->getOperand(1);
33906 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33907
33908 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33909 Results.push_back(Res);
33910 return;
33911 }
33912 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33913 case X86ISD::FMINC:
33914 case X86ISD::FMIN:
33915 case X86ISD::FMAXC:
33916 case X86ISD::FMAX:
33918 case X86ISD::STRICT_FMAX: {
33919 EVT VT = N->getValueType(0);
33920 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33921 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33922 SDValue UNDEF = DAG.getUNDEF(VT);
33923 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33924 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33925 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33926 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33927 SDValue Res;
33928 if (IsStrict)
33929 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33930 {N->getOperand(0), LHS, RHS});
33931 else
33932 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33933 Results.push_back(Res);
33934 if (IsStrict)
33935 Results.push_back(Res.getValue(1));
33936 return;
33937 }
33938 case ISD::SDIV:
33939 case ISD::UDIV:
33940 case ISD::SREM:
33941 case ISD::UREM: {
33942 EVT VT = N->getValueType(0);
33943 if (VT.isVector()) {
33945 "Unexpected type action!");
33946 // If this RHS is a constant splat vector we can widen this and let
33947 // division/remainder by constant optimize it.
33948 // TODO: Can we do something for non-splat?
33949 APInt SplatVal;
33950 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33951 unsigned NumConcats = 128 / VT.getSizeInBits();
33952 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33953 Ops0[0] = N->getOperand(0);
33954 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33955 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33956 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33957 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33958 Results.push_back(Res);
33959 }
33960 return;
33961 }
33962
33963 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33964 Results.push_back(V);
33965 return;
33966 }
33967 case ISD::TRUNCATE: {
33968 MVT VT = N->getSimpleValueType(0);
33969 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33970 return;
33971
33972 // The generic legalizer will try to widen the input type to the same
33973 // number of elements as the widened result type. But this isn't always
33974 // the best thing so do some custom legalization to avoid some cases.
33975 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33976 SDValue In = N->getOperand(0);
33977 EVT InVT = In.getValueType();
33978 EVT InEltVT = InVT.getVectorElementType();
33979 EVT EltVT = VT.getVectorElementType();
33980 unsigned MinElts = VT.getVectorNumElements();
33981 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33982 unsigned InBits = InVT.getSizeInBits();
33983
33984 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33985 unsigned PackOpcode;
33986 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33987 Subtarget, N->getFlags())) {
33988 if (SDValue Res =
33989 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
33990 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
33991 Results.push_back(Res);
33992 return;
33993 }
33994 }
33995
33996 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
33997 // 128 bit and smaller inputs should avoid truncate all together and
33998 // use a shuffle.
33999 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34000 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34001 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34002 for (unsigned I = 0; I < MinElts; ++I)
34003 TruncMask[I] = Scale * I;
34004 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34005 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34006 "Illegal vector type in truncation");
34007 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34008 Results.push_back(
34009 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34010 return;
34011 }
34012 }
34013
34014 // With AVX512 there are some cases that can use a target specific
34015 // truncate node to go from 256/512 to less than 128 with zeros in the
34016 // upper elements of the 128 bit result.
34017 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34018 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34019 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34020 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34021 return;
34022 }
34023 // There's one case we can widen to 512 bits and use VTRUNC.
34024 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34025 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34026 DAG.getUNDEF(MVT::v4i64));
34027 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34028 return;
34029 }
34030 }
34031 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34032 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34033 isTypeLegal(MVT::v4i64)) {
34034 // Input needs to be split and output needs to widened. Let's use two
34035 // VTRUNCs, and shuffle their results together into the wider type.
34036 SDValue Lo, Hi;
34037 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34038
34039 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34040 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34041 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34042 { 0, 1, 2, 3, 16, 17, 18, 19,
34043 -1, -1, -1, -1, -1, -1, -1, -1 });
34044 Results.push_back(Res);
34045 return;
34046 }
34047
34048 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34049 // this via type legalization.
34050 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34051 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34052 (!Subtarget.hasSSSE3() ||
34053 (!isTypeLegal(InVT) &&
34054 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34055 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34056 InEltVT.getSizeInBits() * WidenNumElts);
34057 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34058 return;
34059 }
34060
34061 return;
34062 }
34063 case ISD::ANY_EXTEND:
34064 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34065 // It's intended to custom handle the input type.
34066 assert(N->getValueType(0) == MVT::v8i8 &&
34067 "Do not know how to legalize this Node");
34068 return;
34069 case ISD::SIGN_EXTEND:
34070 case ISD::ZERO_EXTEND: {
34071 EVT VT = N->getValueType(0);
34072 SDValue In = N->getOperand(0);
34073 EVT InVT = In.getValueType();
34074 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34075 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34077 "Unexpected type action!");
34078 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34079 // Custom split this so we can extend i8/i16->i32 invec. This is better
34080 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34081 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34082 // we allow the sra from the extend to i32 to be shared by the split.
34083 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34084
34085 // Fill a vector with sign bits for each element.
34086 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34087 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34088
34089 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34090 // to v2i64.
34091 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34092 {0, 4, 1, 5});
34093 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34094 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34095 {2, 6, 3, 7});
34096 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34097
34098 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34099 Results.push_back(Res);
34100 return;
34101 }
34102
34103 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34104 if (!InVT.is128BitVector()) {
34105 // Not a 128 bit vector, but maybe type legalization will promote
34106 // it to 128 bits.
34107 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34108 return;
34109 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34110 if (!InVT.is128BitVector())
34111 return;
34112
34113 // Promote the input to 128 bits. Type legalization will turn this into
34114 // zext_inreg/sext_inreg.
34115 In = DAG.getNode(Opc, dl, InVT, In);
34116 }
34117
34118 // Perform custom splitting instead of the two stage extend we would get
34119 // by default.
34120 EVT LoVT, HiVT;
34121 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34122 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34123
34124 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34125
34126 // We need to shift the input over by half the number of elements.
34127 unsigned NumElts = InVT.getVectorNumElements();
34128 unsigned HalfNumElts = NumElts / 2;
34129 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34130 for (unsigned i = 0; i != HalfNumElts; ++i)
34131 ShufMask[i] = i + HalfNumElts;
34132
34133 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34134 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34135
34136 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34137 Results.push_back(Res);
34138 }
34139 return;
34140 }
34142 case ISD::FP_TO_UINT_SAT: {
34143 if (!Subtarget.hasAVX10_2())
34144 return;
34145
34146 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34147 EVT VT = N->getValueType(0);
34148 SDValue Op = N->getOperand(0);
34149 EVT OpVT = Op.getValueType();
34150 SDValue Res;
34151
34152 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34153 if (IsSigned)
34154 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34155 else
34156 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34157 Results.push_back(Res);
34158 }
34159 return;
34160 }
34161 case ISD::FP_TO_SINT:
34163 case ISD::FP_TO_UINT:
34165 bool IsStrict = N->isStrictFPOpcode();
34166 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34167 EVT VT = N->getValueType(0);
34168 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34169 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34170 EVT SrcVT = Src.getValueType();
34171
34172 SDValue Res;
34173 if (isSoftF16(SrcVT, Subtarget)) {
34174 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34175 if (IsStrict) {
34176 Res =
34177 DAG.getNode(Opc, dl, {VT, MVT::Other},
34178 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34179 {NVT, MVT::Other}, {Chain, Src})});
34180 Chain = Res.getValue(1);
34181 } else {
34182 Res =
34183 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34184 }
34185 Results.push_back(Res);
34186 if (IsStrict)
34187 Results.push_back(Chain);
34188
34189 return;
34190 }
34191
34192 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34193 SrcVT.getVectorElementType() == MVT::f16) {
34194 EVT EleVT = VT.getVectorElementType();
34195 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34196
34197 if (SrcVT != MVT::v8f16) {
34198 SDValue Tmp =
34199 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34200 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34201 Ops[0] = Src;
34202 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34203 }
34204
34205 if (IsStrict) {
34207 Res =
34208 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34209 Chain = Res.getValue(1);
34210 } else {
34211 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34212 Res = DAG.getNode(Opc, dl, ResVT, Src);
34213 }
34214
34215 // TODO: Need to add exception check code for strict FP.
34216 if (EleVT.getSizeInBits() < 16) {
34217 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34218 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34219
34220 // Now widen to 128 bits.
34221 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34222 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34223 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34224 ConcatOps[0] = Res;
34225 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34226 }
34227
34228 Results.push_back(Res);
34229 if (IsStrict)
34230 Results.push_back(Chain);
34231
34232 return;
34233 }
34234
34235 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34237 "Unexpected type action!");
34238
34239 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34240 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34241 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34243 SDValue Res;
34244 SDValue Chain;
34245 if (IsStrict) {
34246 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34247 {N->getOperand(0), Src});
34248 Chain = Res.getValue(1);
34249 } else
34250 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34251
34252 // Preserve what we know about the size of the original result. If the
34253 // result is v2i32, we have to manually widen the assert.
34254 if (PromoteVT == MVT::v2i32)
34255 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34256 DAG.getUNDEF(MVT::v2i32));
34257
34258 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34259 Res.getValueType(), Res,
34261
34262 if (PromoteVT == MVT::v2i32)
34263 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34264 DAG.getVectorIdxConstant(0, dl));
34265
34266 // Truncate back to the original width.
34267 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34268
34269 // Now widen to 128 bits.
34270 unsigned NumConcats = 128 / VT.getSizeInBits();
34272 VT.getVectorNumElements() * NumConcats);
34273 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34274 ConcatOps[0] = Res;
34275 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34276 Results.push_back(Res);
34277 if (IsStrict)
34278 Results.push_back(Chain);
34279 return;
34280 }
34281
34282
34283 if (VT == MVT::v2i32) {
34284 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34285 "Strict unsigned conversion requires AVX512");
34286 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34288 "Unexpected type action!");
34289 if (Src.getValueType() == MVT::v2f64) {
34290 if (!IsSigned && !Subtarget.hasAVX512()) {
34291 SDValue Res =
34292 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34293 Results.push_back(Res);
34294 return;
34295 }
34296
34297 if (IsStrict)
34299 else
34300 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34301
34302 // If we have VLX we can emit a target specific FP_TO_UINT node,.
34303 if (!IsSigned && !Subtarget.hasVLX()) {
34304 // Otherwise we can defer to the generic legalizer which will widen
34305 // the input as well. This will be further widened during op
34306 // legalization to v8i32<-v8f64.
34307 // For strict nodes we'll need to widen ourselves.
34308 // FIXME: Fix the type legalizer to safely widen strict nodes?
34309 if (!IsStrict)
34310 return;
34311 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34312 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34313 Opc = N->getOpcode();
34314 }
34315 SDValue Res;
34316 SDValue Chain;
34317 if (IsStrict) {
34318 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34319 {N->getOperand(0), Src});
34320 Chain = Res.getValue(1);
34321 } else {
34322 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34323 }
34324 Results.push_back(Res);
34325 if (IsStrict)
34326 Results.push_back(Chain);
34327 return;
34328 }
34329
34330 // Custom widen strict v2f32->v2i32 by padding with zeros.
34331 // FIXME: Should generic type legalizer do this?
34332 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34333 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34334 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34335 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34336 {N->getOperand(0), Src});
34337 Results.push_back(Res);
34338 Results.push_back(Res.getValue(1));
34339 return;
34340 }
34341
34342 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34343 // so early out here.
34344 return;
34345 }
34346
34347 assert(!VT.isVector() && "Vectors should have been handled above!");
34348
34349 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34350 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34351 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34352 assert(!Subtarget.is64Bit() && "i64 should be legal");
34353 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34354 // If we use a 128-bit result we might need to use a target specific node.
34355 unsigned SrcElts =
34356 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34357 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34358 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34359 if (NumElts != SrcElts) {
34360 if (IsStrict)
34362 else
34363 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34364 }
34365
34366 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34367 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34368 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34369 ZeroIdx);
34370 SDValue Chain;
34371 if (IsStrict) {
34372 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34373 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34374 Chain = Res.getValue(1);
34375 } else
34376 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34377 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34378 Results.push_back(Res);
34379 if (IsStrict)
34380 Results.push_back(Chain);
34381 return;
34382 }
34383
34384 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34385 SDValue Chain;
34386 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34387 Results.push_back(V);
34388 if (IsStrict)
34389 Results.push_back(Chain);
34390 return;
34391 }
34392
34393 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34394 Results.push_back(V);
34395 if (IsStrict)
34396 Results.push_back(Chain);
34397 }
34398 return;
34399 }
34400 case ISD::LRINT:
34401 if (N->getValueType(0) == MVT::v2i32) {
34402 SDValue Src = N->getOperand(0);
34403 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34404 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34405 DAG.getUNDEF(MVT::v2f16));
34406 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34407 DAG.getUNDEF(MVT::v4f16));
34408 } else if (Src.getValueType() != MVT::v2f64) {
34409 return;
34410 }
34411 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34412 return;
34413 }
34414 [[fallthrough]];
34415 case ISD::LLRINT: {
34416 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34417 Results.push_back(V);
34418 return;
34419 }
34420
34421 case ISD::SINT_TO_FP:
34423 case ISD::UINT_TO_FP:
34425 bool IsStrict = N->isStrictFPOpcode();
34426 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34427 EVT VT = N->getValueType(0);
34428 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34429 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34430 Subtarget.hasVLX()) {
34431 if (Src.getValueType().getVectorElementType() == MVT::i16)
34432 return;
34433
34434 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34435 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34436 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34437 : DAG.getUNDEF(MVT::v2i32));
34438 if (IsStrict) {
34439 unsigned Opc =
34441 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34442 {N->getOperand(0), Src});
34443 Results.push_back(Res);
34444 Results.push_back(Res.getValue(1));
34445 } else {
34446 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34447 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34448 }
34449 return;
34450 }
34451 if (VT != MVT::v2f32)
34452 return;
34453 EVT SrcVT = Src.getValueType();
34454 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34455 if (IsStrict) {
34456 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34458 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34459 {N->getOperand(0), Src});
34460 Results.push_back(Res);
34461 Results.push_back(Res.getValue(1));
34462 } else {
34463 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34464 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34465 }
34466 return;
34467 }
34468 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34469 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34470 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34471 SDValue One = DAG.getConstant(1, dl, SrcVT);
34472 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34473 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34474 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34475 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34476 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34477 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34478 for (int i = 0; i != 2; ++i) {
34479 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34480 SignSrc, DAG.getVectorIdxConstant(i, dl));
34481 if (IsStrict)
34482 SignCvts[i] =
34483 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34484 {N->getOperand(0), Elt});
34485 else
34486 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34487 };
34488 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34489 SDValue Slow, Chain;
34490 if (IsStrict) {
34491 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34492 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34493 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34494 {Chain, SignCvt, SignCvt});
34495 Chain = Slow.getValue(1);
34496 } else {
34497 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34498 }
34499 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34500 IsNeg =
34501 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34502 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34503 Results.push_back(Cvt);
34504 if (IsStrict)
34505 Results.push_back(Chain);
34506 return;
34507 }
34508
34509 if (SrcVT != MVT::v2i32)
34510 return;
34511
34512 if (IsSigned || Subtarget.hasAVX512()) {
34513 if (!IsStrict)
34514 return;
34515
34516 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34517 // FIXME: Should generic type legalizer do this?
34518 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34519 DAG.getConstant(0, dl, MVT::v2i32));
34520 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34521 {N->getOperand(0), Src});
34522 Results.push_back(Res);
34523 Results.push_back(Res.getValue(1));
34524 return;
34525 }
34526
34527 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34528 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34529 SDValue VBias = DAG.getConstantFP(
34530 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34531 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34532 DAG.getBitcast(MVT::v2i64, VBias));
34533 Or = DAG.getBitcast(MVT::v2f64, Or);
34534 if (IsStrict) {
34535 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34536 {N->getOperand(0), Or, VBias});
34538 {MVT::v4f32, MVT::Other},
34539 {Sub.getValue(1), Sub});
34540 Results.push_back(Res);
34541 Results.push_back(Res.getValue(1));
34542 } else {
34543 // TODO: Are there any fast-math-flags to propagate here?
34544 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34545 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34546 }
34547 return;
34548 }
34550 case ISD::FP_ROUND: {
34551 bool IsStrict = N->isStrictFPOpcode();
34552 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34553 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34554 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34555 EVT SrcVT = Src.getValueType();
34556 EVT VT = N->getValueType(0);
34557 SDValue V;
34558 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34559 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34560 : DAG.getUNDEF(MVT::v2f32);
34561 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34562 }
34563 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34564 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34565 if (SrcVT.getVectorElementType() != MVT::f32)
34566 return;
34567
34568 if (IsStrict)
34569 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34570 {Chain, Src, Rnd});
34571 else
34572 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34573
34574 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34575 if (IsStrict)
34576 Results.push_back(V.getValue(1));
34577 return;
34578 }
34579 if (!isTypeLegal(Src.getValueType()))
34580 return;
34581 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34582 if (IsStrict)
34583 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34584 {Chain, Src});
34585 else
34586 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34587 Results.push_back(V);
34588 if (IsStrict)
34589 Results.push_back(V.getValue(1));
34590 return;
34591 }
34592 case ISD::FP_EXTEND:
34593 case ISD::STRICT_FP_EXTEND: {
34594 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34595 // No other ValueType for FP_EXTEND should reach this point.
34596 assert(N->getValueType(0) == MVT::v2f32 &&
34597 "Do not know how to legalize this Node");
34598 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34599 return;
34600 bool IsStrict = N->isStrictFPOpcode();
34601 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34602 if (Src.getValueType().getVectorElementType() != MVT::f16)
34603 return;
34604 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34605 : DAG.getUNDEF(MVT::v2f16);
34606 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34607 if (IsStrict)
34608 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34609 {N->getOperand(0), V});
34610 else
34611 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34612 Results.push_back(V);
34613 if (IsStrict)
34614 Results.push_back(V.getValue(1));
34615 return;
34616 }
34618 unsigned IntNo = N->getConstantOperandVal(1);
34619 switch (IntNo) {
34620 default : llvm_unreachable("Do not know how to custom type "
34621 "legalize this intrinsic operation!");
34622 case Intrinsic::x86_rdtsc:
34623 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34624 Results);
34625 case Intrinsic::x86_rdtscp:
34626 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34627 Results);
34628 case Intrinsic::x86_rdpmc:
34629 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34630 Results);
34631 return;
34632 case Intrinsic::x86_rdpru:
34633 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34634 Results);
34635 return;
34636 case Intrinsic::x86_xgetbv:
34637 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34638 Results);
34639 return;
34640 }
34641 }
34642 case ISD::READCYCLECOUNTER: {
34643 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34644 }
34646 EVT T = N->getValueType(0);
34647 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34648 bool Regs64bit = T == MVT::i128;
34649 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34650 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34651 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34652 SDValue cpInL, cpInH;
34653 std::tie(cpInL, cpInH) =
34654 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34655 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34656 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34657 cpInH =
34658 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34659 cpInH, cpInL.getValue(1));
34660 SDValue swapInL, swapInH;
34661 std::tie(swapInL, swapInH) =
34662 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34663 swapInH =
34664 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34665 swapInH, cpInH.getValue(1));
34666
34667 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34668 // until later. So we keep the RBX input in a vreg and use a custom
34669 // inserter.
34670 // Since RBX will be a reserved register the register allocator will not
34671 // make sure its value will be properly saved and restored around this
34672 // live-range.
34673 SDValue Result;
34674 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34675 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34676 if (Regs64bit) {
34677 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34678 swapInH.getValue(1)};
34679 Result =
34680 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34681 } else {
34682 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34683 swapInH.getValue(1));
34684 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34685 swapInL.getValue(1)};
34686 Result =
34687 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34688 }
34689
34690 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34691 Regs64bit ? X86::RAX : X86::EAX,
34692 HalfT, Result.getValue(1));
34693 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34694 Regs64bit ? X86::RDX : X86::EDX,
34695 HalfT, cpOutL.getValue(2));
34696 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34697
34698 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34699 MVT::i32, cpOutH.getValue(2));
34700 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34701 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34702
34703 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34704 Results.push_back(Success);
34705 Results.push_back(EFLAGS.getValue(1));
34706 return;
34707 }
34708 case ISD::ATOMIC_LOAD: {
34709 assert(
34710 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34711 "Unexpected VT!");
34712 bool NoImplicitFloatOps =
34714 Attribute::NoImplicitFloat);
34715 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34716 auto *Node = cast<AtomicSDNode>(N);
34717
34718 if (N->getValueType(0) == MVT::i128) {
34719 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34720 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34721 Node->getBasePtr(), Node->getMemOperand());
34722 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34723 DAG.getVectorIdxConstant(0, dl));
34724 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34725 DAG.getVectorIdxConstant(1, dl));
34726 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34727 {ResL, ResH}));
34728 Results.push_back(Ld.getValue(1));
34729 return;
34730 }
34731 break;
34732 }
34733 if (Subtarget.hasSSE1()) {
34734 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34735 // Then extract the lower 64-bits.
34736 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34737 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34738 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34739 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34740 MVT::i64, Node->getMemOperand());
34741 if (Subtarget.hasSSE2()) {
34742 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34743 DAG.getVectorIdxConstant(0, dl));
34744 Results.push_back(Res);
34745 Results.push_back(Ld.getValue(1));
34746 return;
34747 }
34748 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34749 // then casts to i64. This avoids a 128-bit stack temporary being
34750 // created by type legalization if we were to cast v4f32->v2i64.
34751 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34752 DAG.getVectorIdxConstant(0, dl));
34753 Res = DAG.getBitcast(MVT::i64, Res);
34754 Results.push_back(Res);
34755 Results.push_back(Ld.getValue(1));
34756 return;
34757 }
34758 if (Subtarget.hasX87()) {
34759 // First load this into an 80-bit X87 register. This will put the whole
34760 // integer into the significand.
34761 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34762 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34764 dl, Tys, Ops, MVT::i64,
34765 Node->getMemOperand());
34766 SDValue Chain = Result.getValue(1);
34767
34768 // Now store the X87 register to a stack temporary and convert to i64.
34769 // This store is not atomic and doesn't need to be.
34770 // FIXME: We don't need a stack temporary if the result of the load
34771 // is already being stored. We could just directly store there.
34772 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34773 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34774 MachinePointerInfo MPI =
34776 SDValue StoreOps[] = { Chain, Result, StackPtr };
34777 Chain = DAG.getMemIntrinsicNode(
34778 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34779 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34780
34781 // Finally load the value back from the stack temporary and return it.
34782 // This load is not atomic and doesn't need to be.
34783 // This load will be further type legalized.
34784 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34785 Results.push_back(Result);
34786 Results.push_back(Result.getValue(1));
34787 return;
34788 }
34789 }
34790 // TODO: Use MOVLPS when SSE1 is available?
34791 // Delegate to generic TypeLegalization. Situations we can really handle
34792 // should have already been dealt with by AtomicExpandPass.cpp.
34793 break;
34794 }
34795 case ISD::ATOMIC_SWAP:
34806 // Delegate to generic TypeLegalization. Situations we can really handle
34807 // should have already been dealt with by AtomicExpandPass.cpp.
34808 break;
34809
34810 case ISD::BITCAST: {
34811 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34812 EVT DstVT = N->getValueType(0);
34813 EVT SrcVT = N->getOperand(0).getValueType();
34814
34815 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34816 // we can split using the k-register rather than memory.
34817 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34818 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34819 SDValue Lo, Hi;
34820 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34821 Lo = DAG.getBitcast(MVT::i32, Lo);
34822 Hi = DAG.getBitcast(MVT::i32, Hi);
34823 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34824 Results.push_back(Res);
34825 return;
34826 }
34827
34828 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34829 // FIXME: Use v4f32 for SSE1?
34830 assert(Subtarget.hasSSE2() && "Requires SSE2");
34831 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34832 "Unexpected type action!");
34833 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34834 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34835 N->getOperand(0));
34836 Res = DAG.getBitcast(WideVT, Res);
34837 Results.push_back(Res);
34838 return;
34839 }
34840
34841 return;
34842 }
34843 case ISD::MGATHER: {
34844 EVT VT = N->getValueType(0);
34845 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34846 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34847 auto *Gather = cast<MaskedGatherSDNode>(N);
34848 SDValue Index = Gather->getIndex();
34849 if (Index.getValueType() != MVT::v2i64)
34850 return;
34852 "Unexpected type action!");
34853 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34854 SDValue Mask = Gather->getMask();
34855 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34856 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34857 Gather->getPassThru(),
34858 DAG.getUNDEF(VT));
34859 if (!Subtarget.hasVLX()) {
34860 // We need to widen the mask, but the instruction will only use 2
34861 // of its elements. So we can use undef.
34862 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34863 DAG.getUNDEF(MVT::v2i1));
34864 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34865 }
34866 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34867 Gather->getBasePtr(), Index, Gather->getScale() };
34868 SDValue Res = DAG.getMemIntrinsicNode(
34869 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34870 Gather->getMemoryVT(), Gather->getMemOperand());
34871 Results.push_back(Res);
34872 Results.push_back(Res.getValue(1));
34873 return;
34874 }
34875 return;
34876 }
34877 case ISD::LOAD: {
34878 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34879 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34880 // cast since type legalization will try to use an i64 load.
34881 MVT VT = N->getSimpleValueType(0);
34882 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34884 "Unexpected type action!");
34885 if (!ISD::isNON_EXTLoad(N))
34886 return;
34887 auto *Ld = cast<LoadSDNode>(N);
34888 if (Subtarget.hasSSE2()) {
34889 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34890 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34891 Ld->getPointerInfo(), Ld->getBaseAlign(),
34892 Ld->getMemOperand()->getFlags());
34893 SDValue Chain = Res.getValue(1);
34894 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34895 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34896 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34897 Res = DAG.getBitcast(WideVT, Res);
34898 Results.push_back(Res);
34899 Results.push_back(Chain);
34900 return;
34901 }
34902 assert(Subtarget.hasSSE1() && "Expected SSE");
34903 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34904 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34905 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34906 MVT::i64, Ld->getMemOperand());
34907 Results.push_back(Res);
34908 Results.push_back(Res.getValue(1));
34909 return;
34910 }
34911 case ISD::ADDRSPACECAST: {
34912 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34913 Results.push_back(V);
34914 return;
34915 }
34916 case ISD::BITREVERSE: {
34917 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34918 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34919 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34920 // We'll need to move the scalar in two i32 pieces.
34921 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34922 return;
34923 }
34925 // f16 = extract vXf16 %vec, i64 %idx
34926 assert(N->getSimpleValueType(0) == MVT::f16 &&
34927 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34928 assert(Subtarget.hasFP16() && "Expected FP16");
34929 SDValue VecOp = N->getOperand(0);
34931 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34932 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34933 N->getOperand(1));
34934 Split = DAG.getBitcast(MVT::f16, Split);
34935 Results.push_back(Split);
34936 return;
34937 }
34938 }
34939}
34940
/// Return the textual name of an X86-specific SelectionDAG node opcode
/// (e.g. "X86ISD::ADD"), or nullptr when the opcode has no entry here
/// (target-independent opcodes are handled by the generic printer).
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((X86ISD::NodeType)Opcode) {
  case X86ISD::FIRST_NUMBER: break;
// One case per node kind; the macro keeps each entry to a single line.
#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
  NODE_NAME_CASE(BSF)
  NODE_NAME_CASE(BSR)
  NODE_NAME_CASE(FSHL)
  NODE_NAME_CASE(FSHR)
  NODE_NAME_CASE(FAND)
  NODE_NAME_CASE(FANDN)
  NODE_NAME_CASE(FOR)
  NODE_NAME_CASE(FXOR)
  NODE_NAME_CASE(FILD)
  NODE_NAME_CASE(FIST)
  NODE_NAME_CASE(FP_TO_INT_IN_MEM)
  NODE_NAME_CASE(FLD)
  NODE_NAME_CASE(FST)
  NODE_NAME_CASE(CALL)
  NODE_NAME_CASE(CALL_RVMARKER)
  NODE_NAME_CASE(IMP_CALL)
  NODE_NAME_CASE(CMP)
  NODE_NAME_CASE(FCMP)
  NODE_NAME_CASE(STRICT_FCMP)
  NODE_NAME_CASE(STRICT_FCMPS)
  NODE_NAME_CASE(UCOMI)
  NODE_NAME_CASE(COMX)
  NODE_NAME_CASE(UCOMX)
  NODE_NAME_CASE(CMPM)
  NODE_NAME_CASE(CMPMM)
  NODE_NAME_CASE(STRICT_CMPM)
  NODE_NAME_CASE(CMPMM_SAE)
  NODE_NAME_CASE(SETCC)
  NODE_NAME_CASE(SETCC_CARRY)
  NODE_NAME_CASE(FSETCC)
  NODE_NAME_CASE(FSETCCM)
  NODE_NAME_CASE(FSETCCM_SAE)
  NODE_NAME_CASE(CMOV)
  NODE_NAME_CASE(BRCOND)
  NODE_NAME_CASE(RET_GLUE)
  NODE_NAME_CASE(IRET)
  NODE_NAME_CASE(REP_STOS)
  NODE_NAME_CASE(REP_MOVS)
  NODE_NAME_CASE(GlobalBaseReg)
  NODE_NAME_CASE(WrapperRIP)
  NODE_NAME_CASE(MOVQ2DQ)
  NODE_NAME_CASE(MOVDQ2Q)
  NODE_NAME_CASE(MMX_MOVD2W)
  NODE_NAME_CASE(MMX_MOVW2D)
  NODE_NAME_CASE(PEXTRB)
  NODE_NAME_CASE(PEXTRW)
  NODE_NAME_CASE(INSERTPS)
  NODE_NAME_CASE(PINSRB)
  NODE_NAME_CASE(PINSRW)
  NODE_NAME_CASE(PSHUFB)
  NODE_NAME_CASE(ANDNP)
  NODE_NAME_CASE(BLENDI)
  NODE_NAME_CASE(HADD)
  NODE_NAME_CASE(HSUB)
  NODE_NAME_CASE(FHADD)
  NODE_NAME_CASE(FHSUB)
  NODE_NAME_CASE(CONFLICT)
  NODE_NAME_CASE(FMAX)
  NODE_NAME_CASE(FMAXS)
  NODE_NAME_CASE(FMAX_SAE)
  NODE_NAME_CASE(FMAXS_SAE)
  NODE_NAME_CASE(STRICT_FMAX)
  NODE_NAME_CASE(FMIN)
  NODE_NAME_CASE(FMINS)
  NODE_NAME_CASE(FMIN_SAE)
  NODE_NAME_CASE(FMINS_SAE)
  NODE_NAME_CASE(STRICT_FMIN)
  NODE_NAME_CASE(FMAXC)
  NODE_NAME_CASE(FMINC)
  NODE_NAME_CASE(FRSQRT)
  NODE_NAME_CASE(FRCP)
  NODE_NAME_CASE(EXTRQI)
  NODE_NAME_CASE(INSERTQI)
  NODE_NAME_CASE(TLSADDR)
  NODE_NAME_CASE(TLSBASEADDR)
  NODE_NAME_CASE(TLSCALL)
  NODE_NAME_CASE(TLSDESC)
  NODE_NAME_CASE(EH_SJLJ_SETJMP)
  NODE_NAME_CASE(EH_SJLJ_LONGJMP)
  NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
  NODE_NAME_CASE(EH_RETURN)
  NODE_NAME_CASE(TC_RETURN)
  NODE_NAME_CASE(FNSTCW16m)
  NODE_NAME_CASE(FLDCW16m)
  NODE_NAME_CASE(FNSTENVm)
  NODE_NAME_CASE(FLDENVm)
  NODE_NAME_CASE(LCMPXCHG_DAG)
  NODE_NAME_CASE(LCMPXCHG8_DAG)
  NODE_NAME_CASE(LCMPXCHG16_DAG)
  NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
  NODE_NAME_CASE(LADD)
  NODE_NAME_CASE(LSUB)
  NODE_NAME_CASE(LOR)
  NODE_NAME_CASE(LXOR)
  NODE_NAME_CASE(LAND)
  NODE_NAME_CASE(LBTS)
  NODE_NAME_CASE(LBTC)
  NODE_NAME_CASE(LBTR)
  NODE_NAME_CASE(LBTS_RM)
  NODE_NAME_CASE(LBTC_RM)
  NODE_NAME_CASE(LBTR_RM)
  NODE_NAME_CASE(AADD)
  NODE_NAME_CASE(AOR)
  NODE_NAME_CASE(AXOR)
  NODE_NAME_CASE(AAND)
  NODE_NAME_CASE(VZEXT_MOVL)
  NODE_NAME_CASE(VZEXT_LOAD)
  NODE_NAME_CASE(VEXTRACT_STORE)
  NODE_NAME_CASE(VTRUNC)
  NODE_NAME_CASE(VTRUNCS)
  NODE_NAME_CASE(VTRUNCUS)
  NODE_NAME_CASE(VMTRUNC)
  NODE_NAME_CASE(VMTRUNCS)
  NODE_NAME_CASE(VMTRUNCUS)
  NODE_NAME_CASE(VTRUNCSTORES)
  NODE_NAME_CASE(VTRUNCSTOREUS)
  NODE_NAME_CASE(VMTRUNCSTORES)
  NODE_NAME_CASE(VMTRUNCSTOREUS)
  NODE_NAME_CASE(VFPEXT)
  NODE_NAME_CASE(STRICT_VFPEXT)
  NODE_NAME_CASE(VFPEXT_SAE)
  NODE_NAME_CASE(VFPEXTS)
  NODE_NAME_CASE(VFPEXTS_SAE)
  NODE_NAME_CASE(VFPROUND)
  NODE_NAME_CASE(VFPROUND2)
  NODE_NAME_CASE(VFPROUND2_RND)
  NODE_NAME_CASE(STRICT_VFPROUND)
  NODE_NAME_CASE(VMFPROUND)
  NODE_NAME_CASE(VFPROUND_RND)
  NODE_NAME_CASE(VFPROUNDS)
  NODE_NAME_CASE(VFPROUNDS_RND)
  NODE_NAME_CASE(VSHLDQ)
  NODE_NAME_CASE(VSRLDQ)
  NODE_NAME_CASE(VSHL)
  NODE_NAME_CASE(VSRL)
  NODE_NAME_CASE(VSRA)
  NODE_NAME_CASE(VSHLI)
  NODE_NAME_CASE(VSRLI)
  NODE_NAME_CASE(VSRAI)
  NODE_NAME_CASE(VSHLV)
  NODE_NAME_CASE(VSRLV)
  NODE_NAME_CASE(VSRAV)
  NODE_NAME_CASE(VROTLI)
  NODE_NAME_CASE(VROTRI)
  NODE_NAME_CASE(VPPERM)
  NODE_NAME_CASE(CMPP)
  NODE_NAME_CASE(STRICT_CMPP)
  NODE_NAME_CASE(PCMPEQ)
  NODE_NAME_CASE(PCMPGT)
  NODE_NAME_CASE(PHMINPOS)
  NODE_NAME_CASE(ADD)
  NODE_NAME_CASE(SUB)
  NODE_NAME_CASE(ADC)
  NODE_NAME_CASE(SBB)
  NODE_NAME_CASE(SMUL)
  NODE_NAME_CASE(UMUL)
  NODE_NAME_CASE(OR)
  NODE_NAME_CASE(XOR)
  NODE_NAME_CASE(AND)
  NODE_NAME_CASE(BEXTR)
  NODE_NAME_CASE(BZHI)
  NODE_NAME_CASE(PDEP)
  NODE_NAME_CASE(PEXT)
  NODE_NAME_CASE(MUL_IMM)
  NODE_NAME_CASE(MOVMSK)
  NODE_NAME_CASE(PTEST)
  NODE_NAME_CASE(TESTP)
  NODE_NAME_CASE(KORTEST)
  NODE_NAME_CASE(KTEST)
  NODE_NAME_CASE(KADD)
  NODE_NAME_CASE(KSHIFTL)
  NODE_NAME_CASE(KSHIFTR)
  NODE_NAME_CASE(PACKSS)
  NODE_NAME_CASE(PACKUS)
  NODE_NAME_CASE(PALIGNR)
  NODE_NAME_CASE(VALIGN)
  NODE_NAME_CASE(VSHLD)
  NODE_NAME_CASE(VSHRD)
  NODE_NAME_CASE(VSHLDV)
  NODE_NAME_CASE(VSHRDV)
  NODE_NAME_CASE(PSHUFD)
  NODE_NAME_CASE(PSHUFHW)
  NODE_NAME_CASE(PSHUFLW)
  NODE_NAME_CASE(SHUFP)
  NODE_NAME_CASE(SHUF128)
  NODE_NAME_CASE(MOVLHPS)
  NODE_NAME_CASE(MOVHLPS)
  NODE_NAME_CASE(MOVDDUP)
  NODE_NAME_CASE(MOVSHDUP)
  NODE_NAME_CASE(MOVSLDUP)
  NODE_NAME_CASE(MOVSD)
  NODE_NAME_CASE(MOVSS)
  NODE_NAME_CASE(MOVSH)
  NODE_NAME_CASE(UNPCKL)
  NODE_NAME_CASE(UNPCKH)
  NODE_NAME_CASE(VBROADCAST)
  NODE_NAME_CASE(VBROADCAST_LOAD)
  NODE_NAME_CASE(VBROADCASTM)
  NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
  NODE_NAME_CASE(VPERMILPV)
  NODE_NAME_CASE(VPERMILPI)
  NODE_NAME_CASE(VPERM2X128)
  NODE_NAME_CASE(VPERMV)
  NODE_NAME_CASE(VPERMV3)
  NODE_NAME_CASE(VPERMI)
  NODE_NAME_CASE(VPTERNLOG)
  NODE_NAME_CASE(FP_TO_SINT_SAT)
  NODE_NAME_CASE(FP_TO_UINT_SAT)
  NODE_NAME_CASE(VFIXUPIMM)
  NODE_NAME_CASE(VFIXUPIMM_SAE)
  NODE_NAME_CASE(VFIXUPIMMS)
  NODE_NAME_CASE(VFIXUPIMMS_SAE)
  NODE_NAME_CASE(VRANGE)
  NODE_NAME_CASE(VRANGE_SAE)
  NODE_NAME_CASE(VRANGES)
  NODE_NAME_CASE(VRANGES_SAE)
  NODE_NAME_CASE(PMULUDQ)
  NODE_NAME_CASE(PMULDQ)
  NODE_NAME_CASE(PSADBW)
  NODE_NAME_CASE(DBPSADBW)
  NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
  NODE_NAME_CASE(VAARG_64)
  NODE_NAME_CASE(VAARG_X32)
  NODE_NAME_CASE(DYN_ALLOCA)
  NODE_NAME_CASE(MFENCE)
  NODE_NAME_CASE(SEG_ALLOCA)
  NODE_NAME_CASE(PROBED_ALLOCA)
  NODE_NAME_CASE(RDPKRU)
  NODE_NAME_CASE(WRPKRU)
  NODE_NAME_CASE(VPMADDUBSW)
  NODE_NAME_CASE(VPMADDWD)
  NODE_NAME_CASE(VPSHA)
  NODE_NAME_CASE(VPSHL)
  NODE_NAME_CASE(VPCOM)
  NODE_NAME_CASE(VPCOMU)
  NODE_NAME_CASE(VPERMIL2)
  NODE_NAME_CASE(STRICT_FMSUB)
  NODE_NAME_CASE(STRICT_FNMADD)
  NODE_NAME_CASE(STRICT_FNMSUB)
  NODE_NAME_CASE(FMADDSUB)
  NODE_NAME_CASE(FMSUBADD)
  NODE_NAME_CASE(FMADD_RND)
  NODE_NAME_CASE(FNMADD_RND)
  NODE_NAME_CASE(FMSUB_RND)
  NODE_NAME_CASE(FNMSUB_RND)
  NODE_NAME_CASE(FMADDSUB_RND)
  NODE_NAME_CASE(FMSUBADD_RND)
  NODE_NAME_CASE(VFMADDC)
  NODE_NAME_CASE(VFMADDC_RND)
  NODE_NAME_CASE(VFCMADDC)
  NODE_NAME_CASE(VFCMADDC_RND)
  NODE_NAME_CASE(VFMULC)
  NODE_NAME_CASE(VFMULC_RND)
  NODE_NAME_CASE(VFCMULC)
  NODE_NAME_CASE(VFCMULC_RND)
  NODE_NAME_CASE(VFMULCSH)
  NODE_NAME_CASE(VFMULCSH_RND)
  NODE_NAME_CASE(VFCMULCSH)
  NODE_NAME_CASE(VFCMULCSH_RND)
  NODE_NAME_CASE(VFMADDCSH)
  NODE_NAME_CASE(VFMADDCSH_RND)
  NODE_NAME_CASE(VFCMADDCSH)
  NODE_NAME_CASE(VFCMADDCSH_RND)
  NODE_NAME_CASE(VPMADD52H)
  NODE_NAME_CASE(VPMADD52L)
  NODE_NAME_CASE(VRNDSCALE)
  NODE_NAME_CASE(STRICT_VRNDSCALE)
  NODE_NAME_CASE(VRNDSCALE_SAE)
  NODE_NAME_CASE(VRNDSCALES)
  NODE_NAME_CASE(VRNDSCALES_SAE)
  NODE_NAME_CASE(VREDUCE)
  NODE_NAME_CASE(VREDUCE_SAE)
  NODE_NAME_CASE(VREDUCES)
  NODE_NAME_CASE(VREDUCES_SAE)
  NODE_NAME_CASE(VGETMANT)
  NODE_NAME_CASE(VGETMANT_SAE)
  NODE_NAME_CASE(VGETMANTS)
  NODE_NAME_CASE(VGETMANTS_SAE)
  NODE_NAME_CASE(PCMPESTR)
  NODE_NAME_CASE(PCMPISTR)
  NODE_NAME_CASE(COMPRESS)
  NODE_NAME_CASE(SELECTS)
  NODE_NAME_CASE(ADDSUB)
  NODE_NAME_CASE(RCP14)
  NODE_NAME_CASE(RCP14S)
  NODE_NAME_CASE(RSQRT14)
  NODE_NAME_CASE(RSQRT14S)
  NODE_NAME_CASE(FADD_RND)
  NODE_NAME_CASE(FADDS)
  NODE_NAME_CASE(FADDS_RND)
  NODE_NAME_CASE(FSUB_RND)
  NODE_NAME_CASE(FSUBS)
  NODE_NAME_CASE(FSUBS_RND)
  NODE_NAME_CASE(FMUL_RND)
  NODE_NAME_CASE(FMULS)
  NODE_NAME_CASE(FMULS_RND)
  NODE_NAME_CASE(FDIV_RND)
  NODE_NAME_CASE(FDIVS)
  NODE_NAME_CASE(FDIVS_RND)
  NODE_NAME_CASE(FSQRT_RND)
  NODE_NAME_CASE(FSQRTS)
  NODE_NAME_CASE(FSQRTS_RND)
  NODE_NAME_CASE(FGETEXP)
  NODE_NAME_CASE(FGETEXP_SAE)
  NODE_NAME_CASE(FGETEXPS)
  NODE_NAME_CASE(FGETEXPS_SAE)
  NODE_NAME_CASE(SCALEF)
  NODE_NAME_CASE(SCALEF_RND)
  NODE_NAME_CASE(SCALEFS)
  NODE_NAME_CASE(SCALEFS_RND)
  NODE_NAME_CASE(MULHRS)
  NODE_NAME_CASE(SINT_TO_FP_RND)
  NODE_NAME_CASE(UINT_TO_FP_RND)
  NODE_NAME_CASE(CVTTP2SI)
  NODE_NAME_CASE(CVTTP2UI)
  NODE_NAME_CASE(STRICT_CVTTP2SI)
  NODE_NAME_CASE(STRICT_CVTTP2UI)
  NODE_NAME_CASE(MCVTTP2SI)
  NODE_NAME_CASE(MCVTTP2UI)
  NODE_NAME_CASE(CVTTP2SI_SAE)
  NODE_NAME_CASE(CVTTP2UI_SAE)
  NODE_NAME_CASE(CVTTS2SI)
  NODE_NAME_CASE(CVTTS2UI)
  NODE_NAME_CASE(CVTTS2SI_SAE)
  NODE_NAME_CASE(CVTTS2UI_SAE)
  NODE_NAME_CASE(CVTSI2P)
  NODE_NAME_CASE(CVTUI2P)
  NODE_NAME_CASE(STRICT_CVTSI2P)
  NODE_NAME_CASE(STRICT_CVTUI2P)
  NODE_NAME_CASE(MCVTSI2P)
  NODE_NAME_CASE(MCVTUI2P)
  NODE_NAME_CASE(VFPCLASS)
  NODE_NAME_CASE(VFPCLASSS)
  NODE_NAME_CASE(MULTISHIFT)
  NODE_NAME_CASE(SCALAR_SINT_TO_FP)
  NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
  NODE_NAME_CASE(SCALAR_UINT_TO_FP)
  NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
  NODE_NAME_CASE(CVTPS2PH)
  NODE_NAME_CASE(STRICT_CVTPS2PH)
  NODE_NAME_CASE(CVTPS2PH_SAE)
  NODE_NAME_CASE(MCVTPS2PH)
  NODE_NAME_CASE(MCVTPS2PH_SAE)
  NODE_NAME_CASE(CVTPH2PS)
  NODE_NAME_CASE(STRICT_CVTPH2PS)
  NODE_NAME_CASE(CVTPH2PS_SAE)
  NODE_NAME_CASE(CVTP2SI)
  NODE_NAME_CASE(CVTP2UI)
  NODE_NAME_CASE(MCVTP2SI)
  NODE_NAME_CASE(MCVTP2UI)
  NODE_NAME_CASE(CVTP2SI_RND)
  NODE_NAME_CASE(CVTP2UI_RND)
  NODE_NAME_CASE(CVTS2SI)
  NODE_NAME_CASE(CVTS2UI)
  NODE_NAME_CASE(CVTS2SI_RND)
  NODE_NAME_CASE(CVTS2UI_RND)
  NODE_NAME_CASE(CVTNEPS2BF16)
  NODE_NAME_CASE(MCVTNEPS2BF16)
  NODE_NAME_CASE(DPBF16PS)
  NODE_NAME_CASE(DPFP16PS)
  NODE_NAME_CASE(MPSADBW)
  NODE_NAME_CASE(LWPINS)
  NODE_NAME_CASE(MGATHER)
  NODE_NAME_CASE(MSCATTER)
  NODE_NAME_CASE(VPDPBUSD)
  NODE_NAME_CASE(VPDPBUSDS)
  NODE_NAME_CASE(VPDPWSSD)
  NODE_NAME_CASE(VPDPWSSDS)
  NODE_NAME_CASE(VPSHUFBITQMB)
  NODE_NAME_CASE(GF2P8MULB)
  NODE_NAME_CASE(GF2P8AFFINEQB)
  NODE_NAME_CASE(GF2P8AFFINEINVQB)
  NODE_NAME_CASE(NT_CALL)
  NODE_NAME_CASE(NT_BRIND)
  NODE_NAME_CASE(UMWAIT)
  NODE_NAME_CASE(TPAUSE)
  NODE_NAME_CASE(ENQCMD)
  NODE_NAME_CASE(ENQCMDS)
  NODE_NAME_CASE(VP2INTERSECT)
  NODE_NAME_CASE(VPDPBSUD)
  NODE_NAME_CASE(VPDPBSUDS)
  NODE_NAME_CASE(VPDPBUUD)
  NODE_NAME_CASE(VPDPBUUDS)
  NODE_NAME_CASE(VPDPBSSD)
  NODE_NAME_CASE(VPDPBSSDS)
  NODE_NAME_CASE(VPDPWSUD)
  NODE_NAME_CASE(VPDPWSUDS)
  NODE_NAME_CASE(VPDPWUSD)
  NODE_NAME_CASE(VPDPWUSDS)
  NODE_NAME_CASE(VPDPWUUD)
  NODE_NAME_CASE(VPDPWUUDS)
  NODE_NAME_CASE(VMINMAX)
  NODE_NAME_CASE(VMINMAX_SAE)
  NODE_NAME_CASE(VMINMAXS)
  NODE_NAME_CASE(VMINMAXS_SAE)
  NODE_NAME_CASE(CVTP2IBS)
  NODE_NAME_CASE(CVTP2IUBS)
  NODE_NAME_CASE(CVTP2IBS_RND)
  NODE_NAME_CASE(CVTP2IUBS_RND)
  NODE_NAME_CASE(CVTTP2IBS)
  NODE_NAME_CASE(CVTTP2IUBS)
  NODE_NAME_CASE(CVTTP2IBS_SAE)
  NODE_NAME_CASE(CVTTP2IUBS_SAE)
  NODE_NAME_CASE(VCVT2PH2BF8)
  NODE_NAME_CASE(VCVT2PH2BF8S)
  NODE_NAME_CASE(VCVT2PH2HF8)
  NODE_NAME_CASE(VCVT2PH2HF8S)
  NODE_NAME_CASE(VCVTBIASPH2BF8)
  NODE_NAME_CASE(VCVTBIASPH2BF8S)
  NODE_NAME_CASE(VCVTBIASPH2HF8)
  NODE_NAME_CASE(VCVTBIASPH2HF8S)
  NODE_NAME_CASE(VCVTPH2BF8)
  NODE_NAME_CASE(VCVTPH2BF8S)
  NODE_NAME_CASE(VCVTPH2HF8)
  NODE_NAME_CASE(VCVTPH2HF8S)
  NODE_NAME_CASE(VMCVTBIASPH2BF8)
  NODE_NAME_CASE(VMCVTBIASPH2BF8S)
  NODE_NAME_CASE(VMCVTBIASPH2HF8)
  NODE_NAME_CASE(VMCVTBIASPH2HF8S)
  NODE_NAME_CASE(VMCVTPH2BF8)
  NODE_NAME_CASE(VMCVTPH2BF8S)
  NODE_NAME_CASE(VMCVTPH2HF8)
  NODE_NAME_CASE(VMCVTPH2HF8S)
  NODE_NAME_CASE(VCVTHF82PH)
  NODE_NAME_CASE(AESENC128KL)
  NODE_NAME_CASE(AESDEC128KL)
  NODE_NAME_CASE(AESENC256KL)
  NODE_NAME_CASE(AESDEC256KL)
  NODE_NAME_CASE(AESENCWIDE128KL)
  NODE_NAME_CASE(AESDECWIDE128KL)
  NODE_NAME_CASE(AESENCWIDE256KL)
  NODE_NAME_CASE(AESDECWIDE256KL)
  NODE_NAME_CASE(CMPCCXADD)
  NODE_NAME_CASE(TESTUI)
  NODE_NAME_CASE(FP80_ADD)
  NODE_NAME_CASE(STRICT_FP80_ADD)
  NODE_NAME_CASE(CCMP)
  NODE_NAME_CASE(CTEST)
  NODE_NAME_CASE(CLOAD)
  NODE_NAME_CASE(CSTORE)
  NODE_NAME_CASE(CVTTS2SIS)
  NODE_NAME_CASE(CVTTS2UIS)
  NODE_NAME_CASE(CVTTS2SIS_SAE)
  NODE_NAME_CASE(CVTTS2UIS_SAE)
  NODE_NAME_CASE(CVTTP2SIS)
  NODE_NAME_CASE(MCVTTP2SIS)
  NODE_NAME_CASE(CVTTP2UIS_SAE)
  NODE_NAME_CASE(CVTTP2SIS_SAE)
  NODE_NAME_CASE(CVTTP2UIS)
  NODE_NAME_CASE(MCVTTP2UIS)
  NODE_NAME_CASE(POP_FROM_X87_REG)
  }
  // Unknown/unnamed opcode.
  return nullptr;
#undef NODE_NAME_CASE
}
35412
35413/// Return true if the addressing mode represented by AM is legal for this
35414/// target, for a load/store of the specified type.
35416 const AddrMode &AM, Type *Ty,
35417 unsigned AS,
35418 Instruction *I) const {
35419 // X86 supports extremely general addressing modes.
35421
35422 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35423 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35424 return false;
35425
35426 if (AM.BaseGV) {
35427 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35428
35429 // If a reference to this global requires an extra load, we can't fold it.
35430 if (isGlobalStubReference(GVFlags))
35431 return false;
35432
35433 // If BaseGV requires a register for the PIC base, we cannot also have a
35434 // BaseReg specified.
35435 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35436 return false;
35437
35438 // If lower 4G is not available, then we must use rip-relative addressing.
35439 if ((M != CodeModel::Small || isPositionIndependent()) &&
35440 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35441 return false;
35442 }
35443
35444 switch (AM.Scale) {
35445 case 0:
35446 case 1:
35447 case 2:
35448 case 4:
35449 case 8:
35450 // These scales always work.
35451 break;
35452 case 3:
35453 case 5:
35454 case 9:
35455 // These scales are formed with basereg+scalereg. Only accept if there is
35456 // no basereg yet.
35457 if (AM.HasBaseReg)
35458 return false;
35459 break;
35460 default: // Other stuff never works.
35461 return false;
35462 }
35463
35464 return true;
35465}
35466
35467bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35468 switch (Opcode) {
35469 // These are non-commutative binops.
35470 // TODO: Add more X86ISD opcodes once we have test coverage.
35471 case X86ISD::ANDNP:
35472 case X86ISD::PCMPGT:
35473 case X86ISD::FMAX:
35474 case X86ISD::FMIN:
35475 case X86ISD::FANDN:
35476 case X86ISD::VPSHA:
35477 case X86ISD::VPSHL:
35478 case X86ISD::VSHLV:
35479 case X86ISD::VSRLV:
35480 case X86ISD::VSRAV:
35481 return true;
35482 }
35483
35484 return TargetLoweringBase::isBinOp(Opcode);
35485}
35486
35487bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35488 switch (Opcode) {
35489 // TODO: Add more X86ISD opcodes once we have test coverage.
35490 case X86ISD::PCMPEQ:
35491 case X86ISD::PMULDQ:
35492 case X86ISD::PMULUDQ:
35493 case X86ISD::FMAXC:
35494 case X86ISD::FMINC:
35495 case X86ISD::FAND:
35496 case X86ISD::FOR:
35497 case X86ISD::FXOR:
35498 return true;
35499 }
35500
35502}
35503
35505 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35506 return false;
35507 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35508 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35509 return NumBits1 > NumBits2;
35510}
35511
35513 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35514 return false;
35515
35516 if (!isTypeLegal(EVT::getEVT(Ty1)))
35517 return false;
35518
35519 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35520
35521 // Assuming the caller doesn't have a zeroext or signext return parameter,
35522 // truncation all the way down to i1 is valid.
35523 return true;
35524}
35525
35527 return isInt<32>(Imm);
35528}
35529
35531 // Can also use sub to handle negated immediates.
35532 return isInt<32>(Imm);
35533}
35534
35536 return isInt<32>(Imm);
35537}
35538
35540 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35541 return false;
35542 unsigned NumBits1 = VT1.getSizeInBits();
35543 unsigned NumBits2 = VT2.getSizeInBits();
35544 return NumBits1 > NumBits2;
35545}
35546
35548 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35549 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35550}
35551
35553 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35554 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35555}
35556
35558 EVT VT1 = Val.getValueType();
35559 if (isZExtFree(VT1, VT2))
35560 return true;
35561
35562 if (Val.getOpcode() != ISD::LOAD)
35563 return false;
35564
35565 if (!VT1.isSimple() || !VT1.isInteger() ||
35566 !VT2.isSimple() || !VT2.isInteger())
35567 return false;
35568
35569 switch (VT1.getSimpleVT().SimpleTy) {
35570 default: break;
35571 case MVT::i8:
35572 case MVT::i16:
35573 case MVT::i32:
35574 // X86 has 8, 16, and 32-bit zero-extending loads.
35575 return true;
35576 }
35577
35578 return false;
35579}
35580
35582 if (!Subtarget.is64Bit())
35583 return false;
35585}
35586
35588 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35589 return false;
35590
35591 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35592
35593 // There is no extending load for vXi1.
35594 if (SrcVT.getScalarType() == MVT::i1)
35595 return false;
35596
35597 return true;
35598}
35599
35601 EVT VT) const {
35602 if (Subtarget.useSoftFloat())
35603 return false;
35604
35605 if (!Subtarget.hasAnyFMA())
35606 return false;
35607
35608 VT = VT.getScalarType();
35609
35610 if (!VT.isSimple())
35611 return false;
35612
35613 switch (VT.getSimpleVT().SimpleTy) {
35614 case MVT::f16:
35615 return Subtarget.hasFP16();
35616 case MVT::f32:
35617 case MVT::f64:
35618 return true;
35619 default:
35620 break;
35621 }
35622
35623 return false;
35624}
35625
35627 EVT DestVT) const {
35628 // i16 instructions are longer (0x66 prefix) and potentially slower.
35629 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35630}
35631
35633 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35634 SDValue Y) const {
35635 if (SelectOpcode == ISD::SELECT) {
35636 if (VT.isVector())
35637 return false;
35638 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35639 return false;
35640 using namespace llvm::SDPatternMatch;
35641 // BLSI
35642 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35643 sd_match(X, m_Neg(m_Specific(Y)))))
35644 return true;
35645 // BLSR
35646 if (BinOpcode == ISD::AND &&
35647 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35648 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35649 return true;
35650 // BLSMSK
35651 if (BinOpcode == ISD::XOR &&
35652 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35653 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35654 return true;
35655
35656 return false;
35657 }
35658 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35659 // benefit. The transform may also be profitable for scalar code.
35660 if (!Subtarget.hasAVX512())
35661 return false;
35662 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35663 return false;
35664 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35665 return false;
35666
35667 return true;
35668}
35669
35670/// Targets can use this to indicate that they only support *some*
35671/// VECTOR_SHUFFLE operations, those with specific masks.
35672/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35673/// are assumed to be legal.
35675 if (!VT.isSimple())
35676 return false;
35677
35678 // Not for i1 vectors
35679 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35680 return false;
35681
35682 // Very little shuffling can be done for 64-bit vectors right now.
35683 if (VT.getSimpleVT().getSizeInBits() == 64)
35684 return false;
35685
35686 // We only care that the types being shuffled are legal. The lowering can
35687 // handle any possible shuffle mask that results.
35688 return isTypeLegal(VT.getSimpleVT());
35689}
35690
35692 EVT VT) const {
35693 // Don't convert an 'and' into a shuffle that we don't directly support.
35694 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35695 if (!Subtarget.hasAVX2())
35696 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35697 return false;
35698
35699 // Just delegate to the generic legality, clear masks aren't special.
35700 return isShuffleMaskLegal(Mask, VT);
35701}
35702
35704 // If the subtarget is using thunks, we need to not generate jump tables.
35705 if (Subtarget.useIndirectThunkBranches())
35706 return false;
35707
35708 // Otherwise, fallback on the generic logic.
35710}
35711
35713 EVT ConditionVT) const {
35714 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35715 // zero-extensions.
35716 if (ConditionVT.getSizeInBits() < 32)
35717 return MVT::i32;
35719 ConditionVT);
35720}
35721
35722//===----------------------------------------------------------------------===//
35723// X86 Scheduler Hooks
35724//===----------------------------------------------------------------------===//
35725
35726/// Utility function to emit xbegin specifying the start of an RTM region.
/// Builds the thisMBB -> {mainMBB, fallMBB} -> sinkMBB diamond shown below,
/// erases the original pseudo, and returns sinkMBB so the caller can resume
/// insertion there.
/// NOTE(review): the signature line (35727) and the MachineFunction iterator /
/// MachineRegisterInfo declarations (35732, 35769) are missing from this
/// extraction — the uses of `I` and `MRI` below rely on them.
35728 const TargetInstrInfo *TII) {
35729 const MIMetadata MIMD(MI);
35730
35731 const BasicBlock *BB = MBB->getBasicBlock();
35733
35734 // For the v = xbegin(), we generate
35735 //
35736 // thisMBB:
35737 // xbegin sinkMBB
35738 //
35739 // mainMBB:
35740 // s0 = -1
35741 //
35742 // fallBB:
35743 // eax = # XABORT_DEF
35744 // s1 = eax
35745 //
35746 // sinkMBB:
35747 // v = phi(s0/mainBB, s1/fallBB)
35748
35749 MachineBasicBlock *thisMBB = MBB;
35750 MachineFunction *MF = MBB->getParent();
35751 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35752 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35753 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35754 MF->insert(I, mainMBB);
35755 MF->insert(I, fallMBB);
35756 MF->insert(I, sinkMBB);
35757
 // If EFLAGS is still read after this instruction, record it as a live-in of
 // every newly created block so physreg liveness remains consistent.
35758 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35759 mainMBB->addLiveIn(X86::EFLAGS);
35760 fallMBB->addLiveIn(X86::EFLAGS);
35761 sinkMBB->addLiveIn(X86::EFLAGS);
35762 }
35763
35764 // Transfer the remainder of BB and its successor edges to sinkMBB.
35765 sinkMBB->splice(sinkMBB->begin(), MBB,
35766 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35768
35770 Register DstReg = MI.getOperand(0).getReg();
35771 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35772 Register mainDstReg = MRI.createVirtualRegister(RC);
35773 Register fallDstReg = MRI.createVirtualRegister(RC);
35774
35775 // thisMBB:
35776 // xbegin fallMBB
35777 // # fallthrough to mainMBB
35778 // # abortion to fallMBB
35779 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35780 thisMBB->addSuccessor(mainMBB);
35781 thisMBB->addSuccessor(fallMBB);
35782
35783 // mainMBB:
 // On the successful (non-abort) path the result is -1, matching the
 // XBEGIN_STARTED convention of the value read after xbegin.
35784 // mainDstReg := -1
35785 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35786 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35787 mainMBB->addSuccessor(sinkMBB);
35788
35789 // fallMBB:
35790 // ; pseudo instruction to model hardware's definition from XABORT
35791 // EAX := XABORT_DEF
35792 // fallDstReg := EAX
35793 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35794 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35795 .addReg(X86::EAX);
35796 fallMBB->addSuccessor(sinkMBB);
35797
35798 // sinkMBB:
35799 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35800 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35801 .addReg(mainDstReg).addMBB(mainMBB)
35802 .addReg(fallDstReg).addMBB(fallMBB);
35803
35804 MI.eraseFromParent();
35805 return sinkMBB;
35806}
35807
/// Custom inserter for the X86-64 VAARG pseudo.  Lowers it into either a
/// straight-line overflow-area access (ArgMode == 0) or a
/// thisMBB -> {offsetMBB, overflowMBB} -> endMBB diamond that pulls the
/// argument from the register save area when the gp/fp offset is still in
/// range.  Erases the pseudo and returns the block where insertion resumes.
/// NOTE(review): the return-type line (35808) and the declarations of `MRI`,
/// `AddrRegClass`'s initializer, and `MBBIter` (35849, 35851, 35915) are
/// missing from this extraction — the uses below rely on them.
35809X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35810 MachineBasicBlock *MBB) const {
35811 // Emit va_arg instruction on X86-64.
35812
35813 // Operands to this pseudo-instruction:
35814 // 0 ) Output : destination address (reg)
35815 // 1-5) Input : va_list address (addr, i64mem)
35816 // 6 ) ArgSize : Size (in bytes) of vararg type
35817 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35818 // 8 ) Align : Alignment of type
35819 // 9 ) EFLAGS (implicit-def)
35820
35821 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35822 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35823
35824 Register DestReg = MI.getOperand(0).getReg();
35825 MachineOperand &Base = MI.getOperand(1);
35826 MachineOperand &Scale = MI.getOperand(2);
35827 MachineOperand &Index = MI.getOperand(3);
35828 MachineOperand &Disp = MI.getOperand(4);
35829 MachineOperand &Segment = MI.getOperand(5);
35830 unsigned ArgSize = MI.getOperand(6).getImm();
35831 unsigned ArgMode = MI.getOperand(7).getImm();
35832 Align Alignment = Align(MI.getOperand(8).getImm());
35833
35834 MachineFunction *MF = MBB->getParent();
35835
35836 // Memory Reference
35837 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35838
35839 MachineMemOperand *OldMMO = MI.memoperands().front();
35840
 // The pseudo both reads and writes the va_list; split its single MMO into a
 // load-only and a store-only variant so each emitted instruction carries an
 // accurate memory reference.
35841 // Clone the MMO into two separate MMOs for loading and storing
35842 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35843 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35844 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35845 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35846
35847 // Machine Information
35848 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35850 const TargetRegisterClass *AddrRegClass =
35852 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35853 const MIMetadata MIMD(MI);
35854
35855 // struct va_list {
35856 // i32 gp_offset
35857 // i32 fp_offset
35858 // i64 overflow_area (address)
35859 // i64 reg_save_area (address)
35860 // }
35861 // sizeof(va_list) = 24
35862 // alignment(va_list) = 8
35863
 // Register save area bound: 6 GP regs * 8 bytes, plus 8 XMM regs * 16 bytes
 // when FP arguments participate (presumably the SysV x86-64 layout — TODO
 // confirm against the psABI).
35864 unsigned TotalNumIntRegs = 6;
35865 unsigned TotalNumXMMRegs = 8;
35866 bool UseGPOffset = (ArgMode == 1);
35867 bool UseFPOffset = (ArgMode == 2);
35868 unsigned MaxOffset = TotalNumIntRegs * 8 +
35869 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35870
35871 /* Align ArgSize to a multiple of 8 */
35872 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35873 bool NeedsAlign = (Alignment > 8);
35874
35875 MachineBasicBlock *thisMBB = MBB;
35876 MachineBasicBlock *overflowMBB;
35877 MachineBasicBlock *offsetMBB;
35878 MachineBasicBlock *endMBB;
35879
35880 Register OffsetDestReg; // Argument address computed by offsetMBB
35881 Register OverflowDestReg; // Argument address computed by overflowMBB
35882 Register OffsetReg;
35883
35884 if (!UseGPOffset && !UseFPOffset) {
35885 // If we only pull from the overflow region, we don't create a branch.
35886 // We don't need to alter control flow.
35887 OffsetDestReg = Register(); // unused
35888 OverflowDestReg = DestReg;
35889
35890 offsetMBB = nullptr;
35891 overflowMBB = thisMBB;
35892 endMBB = thisMBB;
35893 } else {
35894 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35895 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35896 // If not, pull from overflow_area. (branch to overflowMBB)
35897 //
35898 // thisMBB
35899 // | .
35900 // | .
35901 // offsetMBB overflowMBB
35902 // | .
35903 // | .
35904 // endMBB
35905
35906 // Registers for the PHI in endMBB
35907 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35908 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35909
35910 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35911 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35912 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35913 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35914
35916
35917 // Insert the new basic blocks
35918 MF->insert(MBBIter, offsetMBB);
35919 MF->insert(MBBIter, overflowMBB);
35920 MF->insert(MBBIter, endMBB);
35921
35922 // Transfer the remainder of MBB and its successor edges to endMBB.
35923 endMBB->splice(endMBB->begin(), thisMBB,
35924 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35925 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35926
35927 // Make offsetMBB and overflowMBB successors of thisMBB
35928 thisMBB->addSuccessor(offsetMBB);
35929 thisMBB->addSuccessor(overflowMBB);
35930
35931 // endMBB is a successor of both offsetMBB and overflowMBB
35932 offsetMBB->addSuccessor(endMBB);
35933 overflowMBB->addSuccessor(endMBB);
35934
 // gp_offset lives at displacement 0, fp_offset at displacement 4 of the
 // va_list (see the struct layout comment above).
35935 // Load the offset value into a register
35936 OffsetReg = MRI.createVirtualRegister(OffsetRegClass)
35937 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35938 .add(Base)
35939 .add(Scale)
35940 .add(Index)
35941 .addDisp(Disp, UseFPOffset ? 4 : 0)
35942 .add(Segment)
35943 .setMemRefs(LoadOnlyMMO);
35944
35945 // Check if there is enough room left to pull this argument.
35946 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35947 .addReg(OffsetReg)
35948 .addImm(MaxOffset + 8 - ArgSizeA8);
35949
35950 // Branch to "overflowMBB" if offset >= max
35951 // Fall through to "offsetMBB" otherwise
35952 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35953 .addMBB(overflowMBB).addImm(X86::COND_AE);
35954 }
35955
35956 // In offsetMBB, emit code to use the reg_save_area.
35957 if (offsetMBB) {
35958 assert(OffsetReg != 0);
35959
35960 // Read the reg_save_area address.
35961 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35962 BuildMI(
35963 offsetMBB, MIMD,
35964 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35965 RegSaveReg)
35966 .add(Base)
35967 .add(Scale)
35968 .add(Index)
35969 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35970 .add(Segment)
35971 .setMemRefs(LoadOnlyMMO);
35972
35973 if (Subtarget.isTarget64BitLP64()) {
35974 // Zero-extend the offset
35975 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35976 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35977 .addImm(0)
35978 .addReg(OffsetReg)
35979 .addImm(X86::sub_32bit);
35980
35981 // Add the offset to the reg_save_area to get the final address.
35982 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35983 .addReg(OffsetReg64)
35984 .addReg(RegSaveReg);
35985 } else {
35986 // Add the offset to the reg_save_area to get the final address.
35987 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35988 .addReg(OffsetReg)
35989 .addReg(RegSaveReg);
35990 }
35991
 // Each GP slot is 8 bytes, each XMM slot is 16 bytes.
35992 // Compute the offset for the next argument
35993 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35994 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35995 .addReg(OffsetReg)
35996 .addImm(UseFPOffset ? 16 : 8);
35997
35998 // Store it back into the va_list.
35999 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36000 .add(Base)
36001 .add(Scale)
36002 .add(Index)
36003 .addDisp(Disp, UseFPOffset ? 4 : 0)
36004 .add(Segment)
36005 .addReg(NextOffsetReg)
36006 .setMemRefs(StoreOnlyMMO);
36007
36008 // Jump to endMBB
36009 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36010 .addMBB(endMBB);
36011 }
36012
36013 //
36014 // Emit code to use overflow area
36015 //
36016
36017 // Load the overflow_area address into a register.
36018 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36019 BuildMI(overflowMBB, MIMD,
36020 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36021 OverflowAddrReg)
36022 .add(Base)
36023 .add(Scale)
36024 .add(Index)
36025 .addDisp(Disp, 8)
36026 .add(Segment)
36027 .setMemRefs(LoadOnlyMMO);
36028
36029 // If we need to align it, do so. Otherwise, just copy the address
36030 // to OverflowDestReg.
36031 if (NeedsAlign) {
36032 // Align the overflow address
36033 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36034
36035 // aligned_addr = (addr + (align-1)) & ~(align-1)
36036 BuildMI(
36037 overflowMBB, MIMD,
36038 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36039 TmpReg)
36040 .addReg(OverflowAddrReg)
36041 .addImm(Alignment.value() - 1);
36042
36043 BuildMI(
36044 overflowMBB, MIMD,
36045 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36046 OverflowDestReg)
36047 .addReg(TmpReg)
36048 .addImm(~(uint64_t)(Alignment.value() - 1));
36049 } else {
36050 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36051 .addReg(OverflowAddrReg);
36052 }
36053
36054 // Compute the next overflow address after this argument.
36055 // (the overflow address should be kept 8-byte aligned)
36056 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36057 BuildMI(
36058 overflowMBB, MIMD,
36059 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36060 NextAddrReg)
36061 .addReg(OverflowDestReg)
36062 .addImm(ArgSizeA8);
36063
36064 // Store the new overflow address.
36065 BuildMI(overflowMBB, MIMD,
36066 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36067 .add(Base)
36068 .add(Scale)
36069 .add(Index)
36070 .addDisp(Disp, 8)
36071 .add(Segment)
36072 .addReg(NextAddrReg)
36073 .setMemRefs(StoreOnlyMMO);
36074
36075 // If we branched, emit the PHI to the front of endMBB.
36076 if (offsetMBB) {
36077 BuildMI(*endMBB, endMBB->begin(), MIMD,
36078 TII->get(X86::PHI), DestReg)
36079 .addReg(OffsetDestReg).addMBB(offsetMBB)
36080 .addReg(OverflowDestReg).addMBB(overflowMBB);
36081 }
36082
36083 // Erase the pseudo instruction
36084 MI.eraseFromParent();
36085
36086 return endMBB;
36087}
36088
36089 // The EFLAGS operand of SelectItr might be missing a kill marker
36090 // because there were multiple uses of EFLAGS, and ISel didn't know
36091 // which to mark. Figure out whether SelectItr should have had a
36092 // kill marker, and set it if it should. Returns the correct kill
36093 // marker value.
// Returns false (no kill) when EFLAGS is still read later in the block;
// otherwise adds the kill flag to SelectItr's EFLAGS use and returns true.
// NOTE(review): the two signature lines (36094-36095) are missing from this
// extraction — confirm the parameter list upstream.
36096 const TargetRegisterInfo* TRI) {
36097 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36098 return false;
36099
36100 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36101 // out. SelectMI should have a kill flag on EFLAGS.
36102 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36103 return true;
36104}
36105
36106 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36107 // together with other CMOV pseudo-opcodes into a single basic-block with
36108 // conditional jump around it.
// The accepted set covers the scalar-FP, GPR, x87, MMX/vector, and AVX-512
// mask-register CMOV pseudos listed below; anything else is rejected.
// NOTE(review): the signature line (36109) is missing from this extraction.
36110 switch (MI.getOpcode()) {
36111 case X86::CMOV_FR16:
36112 case X86::CMOV_FR16X:
36113 case X86::CMOV_FR32:
36114 case X86::CMOV_FR32X:
36115 case X86::CMOV_FR64:
36116 case X86::CMOV_FR64X:
36117 case X86::CMOV_GR8:
36118 case X86::CMOV_GR16:
36119 case X86::CMOV_GR32:
36120 case X86::CMOV_RFP32:
36121 case X86::CMOV_RFP64:
36122 case X86::CMOV_RFP80:
36123 case X86::CMOV_VR64:
36124 case X86::CMOV_VR128:
36125 case X86::CMOV_VR128X:
36126 case X86::CMOV_VR256:
36127 case X86::CMOV_VR256X:
36128 case X86::CMOV_VR512:
36129 case X86::CMOV_VK1:
36130 case X86::CMOV_VK2:
36131 case X86::CMOV_VK4:
36132 case X86::CMOV_VK8:
36133 case X86::CMOV_VK16:
36134 case X86::CMOV_VK32:
36135 case X86::CMOV_VK64:
36136 return true;
36137
36138 default:
36139 return false;
36140 }
36141}
36142
36143 // Helper function, which inserts PHI functions into SinkMBB:
36144 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36145 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36146 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36147 // the last PHI function inserted.
// NOTE(review): the signature lines (36148-36149) and the declarations of
// `TII`, `OppCC`, `RegRewriteTable`, and `MIB` (36153, 36157, 36167-36168)
// are missing from this extraction — the uses below rely on them.
36150 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36151 MachineBasicBlock *SinkMBB) {
36152 MachineFunction *MF = TrueMBB->getParent();
36154 const MIMetadata MIMD(*MIItBegin);
36155
36156 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36158
36159 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36160
36161 // As we are creating the PHIs, we have to be careful if there is more than
36162 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36163 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36164 // That also means that PHI construction must work forward from earlier to
36165 // later, and that the code must maintain a mapping from earlier PHI's
36166 // destination registers, and the registers that went into the PHI.
36169
36170 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36171 Register DestReg = MIIt->getOperand(0).getReg();
36172 Register Op1Reg = MIIt->getOperand(1).getReg();
36173 Register Op2Reg = MIIt->getOperand(2).getReg();
36174
36175 // If this CMOV we are generating is the opposite condition from
36176 // the jump we generated, then we have to swap the operands for the
36177 // PHI that is going to be generated.
36178 if (MIIt->getOperand(3).getImm() == OppCC)
36179 std::swap(Op1Reg, Op2Reg);
36180
 // If an operand is the result of an earlier CMOV in this run, substitute
 // the value it would have taken along the corresponding edge.
36181 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36182 Op1Reg = It->second.first;
36183
36184 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36185 Op2Reg = It->second.second;
36186
36187 MIB =
36188 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36189 .addReg(Op1Reg)
36190 .addMBB(FalseMBB)
36191 .addReg(Op2Reg)
36192 .addMBB(TrueMBB);
36193
36194 // Add this PHI to the rewrite table.
36195 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36196 }
36197
36198 return MIB;
36199}
36200
36201 // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
// Builds the two-branch CFG described below, emits a three-input PHI in
// SinkMBB, erases both CMOV pseudos, and returns SinkMBB.
// NOTE(review): the return-type line (36202) and the `MachineInstrBuilder
// MIB =` declaration (36337) are missing from this extraction.
36203X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36204 MachineInstr &SecondCascadedCMOV,
36205 MachineBasicBlock *ThisMBB) const {
36206 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36207 const MIMetadata MIMD(FirstCMOV);
36208
36209 // We lower cascaded CMOVs such as
36210 //
36211 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36212 //
36213 // to two successive branches.
36214 //
36215 // Without this, we would add a PHI between the two jumps, which ends up
36216 // creating a few copies all around. For instance, for
36217 //
36218 // (sitofp (zext (fcmp une)))
36219 //
36220 // we would generate:
36221 //
36222 // ucomiss %xmm1, %xmm0
36223 // movss <1.0f>, %xmm0
36224 // movaps %xmm0, %xmm1
36225 // jne .LBB5_2
36226 // xorps %xmm1, %xmm1
36227 // .LBB5_2:
36228 // jp .LBB5_4
36229 // movaps %xmm1, %xmm0
36230 // .LBB5_4:
36231 // retq
36232 //
36233 // because this custom-inserter would have generated:
36234 //
36235 // A
36236 // | \
36237 // | B
36238 // | /
36239 // C
36240 // | \
36241 // | D
36242 // | /
36243 // E
36244 //
36245 // A: X = ...; Y = ...
36246 // B: empty
36247 // C: Z = PHI [X, A], [Y, B]
36248 // D: empty
36249 // E: PHI [X, C], [Z, D]
36250 //
36251 // If we lower both CMOVs in a single step, we can instead generate:
36252 //
36253 // A
36254 // | \
36255 // | C
36256 // | /|
36257 // |/ |
36258 // | |
36259 // | D
36260 // | /
36261 // E
36262 //
36263 // A: X = ...; Y = ...
36264 // D: empty
36265 // E: PHI [X, A], [X, C], [Y, D]
36266 //
36267 // Which, in our sitofp/fcmp example, gives us something like:
36268 //
36269 // ucomiss %xmm1, %xmm0
36270 // movss <1.0f>, %xmm0
36271 // jne .LBB5_4
36272 // jp .LBB5_4
36273 // xorps %xmm0, %xmm0
36274 // .LBB5_4:
36275 // retq
36276 //
36277
36278 // We lower cascaded CMOV into two successive branches to the same block.
36279 // EFLAGS is used by both, so mark it as live in the second.
36280 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36281 MachineFunction *F = ThisMBB->getParent();
36282 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36283 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36284 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36285
36286 MachineFunction::iterator It = ++ThisMBB->getIterator();
36287 F->insert(It, FirstInsertedMBB);
36288 F->insert(It, SecondInsertedMBB);
36289 F->insert(It, SinkMBB);
36290
36291 // For a cascaded CMOV, we lower it to two successive branches to
36292 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36293 // the FirstInsertedMBB.
36294 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36295
36296 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36297 // live into the sink and copy blocks.
36298 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36299 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36300 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36301 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36302 SinkMBB->addLiveIn(X86::EFLAGS);
36303 }
36304
36305 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36306 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36307 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36308 ThisMBB->end());
36309 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36310
36311 // Fallthrough block for ThisMBB.
36312 ThisMBB->addSuccessor(FirstInsertedMBB);
36313 // The true block target of the first branch is always SinkMBB.
36314 ThisMBB->addSuccessor(SinkMBB);
36315 // Fallthrough block for FirstInsertedMBB.
36316 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36317 // The true block for the branch of FirstInsertedMBB.
36318 FirstInsertedMBB->addSuccessor(SinkMBB);
36319 // This is fallthrough.
36320 SecondInsertedMBB->addSuccessor(SinkMBB);
36321
36322 // Create the conditional branch instructions.
36323 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36324 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36325
36326 X86::CondCode SecondCC =
36327 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36328 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36329 .addMBB(SinkMBB)
36330 .addImm(SecondCC);
36331
36332 // SinkMBB:
36333 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36334 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36335 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36336 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36338 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36339 .addReg(Op1Reg)
36340 .addMBB(SecondInsertedMBB)
36341 .addReg(Op2Reg)
36342 .addMBB(ThisMBB);
36343
36344 // The second SecondInsertedMBB provides the same incoming value as the
36345 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
36346 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36347
36348 // Now remove the CMOVs.
36349 FirstCMOV.eraseFromParent();
36350 SecondCascadedCMOV.eraseFromParent();
36351
36352 return SinkMBB;
36353}
36354
/// Custom inserter for CMOV pseudos: builds the diamond CFG described below,
/// coalescing runs of same-condition CMOVs into one diamond (case 1) or
/// delegating true cascades to EmitLoweredCascadedSelect (case 2); returns the
/// block where insertion resumes.
/// NOTE(review): the return-type line (36355) and the declarations of
/// `OppCC`, `NextMIIt`, `DbgRange`, `MIItBegin`, and `MIItEnd` (36409, 36411,
/// 36462, 36487-36488) are missing from this extraction — the uses below rely
/// on them.
36356X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36357 MachineBasicBlock *ThisMBB) const {
36358 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36359 const MIMetadata MIMD(MI);
36360
36361 // To "insert" a SELECT_CC instruction, we actually have to insert the
36362 // diamond control-flow pattern. The incoming instruction knows the
36363 // destination vreg to set, the condition code register to branch on, the
36364 // true/false values to select between and a branch opcode to use.
36365
36366 // ThisMBB:
36367 // ...
36368 // TrueVal = ...
36369 // cmpTY ccX, r1, r2
36370 // bCC copy1MBB
36371 // fallthrough --> FalseMBB
36372
36373 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36374 // as described above, by inserting a BB, and then making a PHI at the join
36375 // point to select the true and false operands of the CMOV in the PHI.
36376 //
36377 // The code also handles two different cases of multiple CMOV opcodes
36378 // in a row.
36379 //
36380 // Case 1:
36381 // In this case, there are multiple CMOVs in a row, all which are based on
36382 // the same condition setting (or the exact opposite condition setting).
36383 // In this case we can lower all the CMOVs using a single inserted BB, and
36384 // then make a number of PHIs at the join point to model the CMOVs. The only
36385 // trickiness here, is that in a case like:
36386 //
36387 // t2 = CMOV cond1 t1, f1
36388 // t3 = CMOV cond1 t2, f2
36389 //
36390 // when rewriting this into PHIs, we have to perform some renaming on the
36391 // temps since you cannot have a PHI operand refer to a PHI result earlier
36392 // in the same block. The "simple" but wrong lowering would be:
36393 //
36394 // t2 = PHI t1(BB1), f1(BB2)
36395 // t3 = PHI t2(BB1), f2(BB2)
36396 //
36397 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36398 // renaming is to note that on the path through BB1, t2 is really just a
36399 // copy of t1, and do that renaming, properly generating:
36400 //
36401 // t2 = PHI t1(BB1), f1(BB2)
36402 // t3 = PHI t1(BB1), f2(BB2)
36403 //
36404 // Case 2:
36405 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36406 // function - EmitLoweredCascadedSelect.
36407
36408 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36410 MachineInstr *LastCMOV = &MI;
36412
36413 // Check for case 1, where there are multiple CMOVs with the same condition
36414 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36415 // number of jumps the most.
36416
36417 if (isCMOVPseudo(MI)) {
36418 // See if we have a string of CMOVS with the same condition. Skip over
36419 // intervening debug insts.
36420 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36421 (NextMIIt->getOperand(3).getImm() == CC ||
36422 NextMIIt->getOperand(3).getImm() == OppCC)) {
36423 LastCMOV = &*NextMIIt;
36424 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36425 }
36426 }
36427
36428 // This checks for case 2, but only do this if we didn't already find
36429 // case 1, as indicated by LastCMOV == MI.
36430 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36431 NextMIIt->getOpcode() == MI.getOpcode() &&
36432 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36433 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36434 NextMIIt->getOperand(1).isKill()) {
36435 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36436 }
36437
36438 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36439 MachineFunction *F = ThisMBB->getParent();
36440 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36441 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36442
36443 MachineFunction::iterator It = ++ThisMBB->getIterator();
36444 F->insert(It, FalseMBB);
36445 F->insert(It, SinkMBB);
36446
36447 // Set the call frame size on entry to the new basic blocks.
36448 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36449 FalseMBB->setCallFrameSize(CallFrameSize);
36450 SinkMBB->setCallFrameSize(CallFrameSize);
36451
36452 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36453 // live into the sink and copy blocks.
36454 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36455 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36456 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36457 FalseMBB->addLiveIn(X86::EFLAGS);
36458 SinkMBB->addLiveIn(X86::EFLAGS);
36459 }
36460
36461 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36463 MachineBasicBlock::iterator(LastCMOV));
36464 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36465 if (MI.isDebugInstr())
36466 SinkMBB->push_back(MI.removeFromParent());
36467
36468 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36469 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36470 std::next(MachineBasicBlock::iterator(LastCMOV)),
36471 ThisMBB->end());
36472 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36473
36474 // Fallthrough block for ThisMBB.
36475 ThisMBB->addSuccessor(FalseMBB);
36476 // The true block target of the first (or only) branch is always a SinkMBB.
36477 ThisMBB->addSuccessor(SinkMBB);
36478 // Fallthrough block for FalseMBB.
36479 FalseMBB->addSuccessor(SinkMBB);
36480
36481 // Create the conditional branch instruction.
36482 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36483
36484 // SinkMBB:
36485 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36486 // ...
36489 std::next(MachineBasicBlock::iterator(LastCMOV));
36490 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36491
36492 // Now remove the CMOV(s).
36493 ThisMBB->erase(MIItBegin, MIItEnd);
36494
36495 return SinkMBB;
36496}
36497
36498static unsigned getSUBriOpcode(bool IsLP64) {
36499 if (IsLP64)
36500 return X86::SUB64ri32;
36501 else
36502 return X86::SUB32ri;
36503}
36504
/// Custom inserter for probed dynamic stack allocation: computes the target
/// stack pointer, then loops (testMBB/blockMBB) touching one page at a time
/// until the stack pointer passes it, so no more than one page is ever
/// allocated between two probes.  Erases the pseudo and returns tailMBB.
/// NOTE(review): the return-type line (36505), the `MRI`/`MBBIter`
/// declarations (36516, 36521), the JCC condition immediate (36553), and the
/// final SP copy before MBB->addSuccessor (36589) are missing from this
/// extraction — the uses below rely on them.
36506X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36507 MachineBasicBlock *MBB) const {
36508 MachineFunction *MF = MBB->getParent();
36509 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36510 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36511 const MIMetadata MIMD(MI);
36512 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36513
36514 const unsigned ProbeSize = getStackProbeSize(*MF);
36515
36517 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36518 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36519 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36520
36522 MF->insert(MBBIter, testMBB);
36523 MF->insert(MBBIter, blockMBB);
36524 MF->insert(MBBIter, tailMBB);
36525
36526 Register sizeVReg = MI.getOperand(1).getReg();
36527
36528 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36529
36530 Register TmpStackPtr = MRI.createVirtualRegister(
36531 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36532 Register FinalStackPtr = MRI.createVirtualRegister(
36533 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36534
 // FinalStackPtr = current SP - allocation size: the value SP must reach.
36535 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36536 .addReg(physSPReg);
36537 {
36538 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36539 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36540 .addReg(TmpStackPtr)
36541 .addReg(sizeVReg);
36542 }
36543
36544 // test rsp size
36545
36546 BuildMI(testMBB, MIMD,
36547 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36548 .addReg(FinalStackPtr)
36549 .addReg(physSPReg);
36550
36551 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36552 .addMBB(tailMBB)
36554 testMBB->addSuccessor(blockMBB);
36555 testMBB->addSuccessor(tailMBB);
36556
36557 // Touch the block then extend it. This is done on the opposite side of
36558 // static probe where we allocate then touch, to avoid the need of probing the
36559 // tail of the static alloca. Possible scenarios are:
36560 //
36561 // + ---- <- ------------ <- ------------- <- ------------ +
36562 // | |
36563 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36564 // | |
36565 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36566 //
36567 // The property we want to enforce is to never have more than [page alloc] between two probes.
36568
 // The probe is a read-modify-write XOR of 0 at [SP]: it touches the page
 // without changing the stored value.
36569 const unsigned XORMIOpc =
36570 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36571 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36572 .addImm(0);
36573
36574 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36575 physSPReg)
36576 .addReg(physSPReg)
36577 .addImm(ProbeSize);
36578
36579 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36580 blockMBB->addSuccessor(testMBB);
36581
36582 // Replace original instruction by the expected stack ptr
36583 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36584 MI.getOperand(0).getReg())
36585 .addReg(FinalStackPtr);
36586
36587 tailMBB->splice(tailMBB->end(), MBB,
36588 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36590 MBB->addSuccessor(testMBB);
36591
36592 // Delete the original pseudo instruction.
36593 MI.eraseFromParent();
36594
36595 // And we're done.
36596 return tailMBB;
36597}
36598
36600X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36601 MachineBasicBlock *BB) const {
36602 MachineFunction *MF = BB->getParent();
36603 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36604 const MIMetadata MIMD(MI);
36605 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36606
36607 assert(MF->shouldSplitStack());
36608
36609 const bool Is64Bit = Subtarget.is64Bit();
36610 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36611
36612 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36613 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36614
36615 // BB:
36616 // ... [Till the alloca]
36617 // If stacklet is not large enough, jump to mallocMBB
36618 //
36619 // bumpMBB:
36620 // Allocate by subtracting from RSP
36621 // Jump to continueMBB
36622 //
36623 // mallocMBB:
36624 // Allocate by call to runtime
36625 //
36626 // continueMBB:
36627 // ...
36628 // [rest of original BB]
36629 //
36630
36631 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36632 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36633 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36634
36636 const TargetRegisterClass *AddrRegClass =
36638
36639 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36640 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36641 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36642 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36643 sizeVReg = MI.getOperand(1).getReg(),
36644 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36645
36646 MachineFunction::iterator MBBIter = ++BB->getIterator();
36647
36648 MF->insert(MBBIter, bumpMBB);
36649 MF->insert(MBBIter, mallocMBB);
36650 MF->insert(MBBIter, continueMBB);
36651
36652 continueMBB->splice(continueMBB->begin(), BB,
36653 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36654 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36655
36656 // Add code to the main basic block to check if the stack limit has been hit,
36657 // and if so, jump to mallocMBB otherwise to bumpMBB.
36658 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36659 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36660 .addReg(tmpSPVReg).addReg(sizeVReg);
36661 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36662 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36663 .addReg(SPLimitVReg);
36664 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36665
36666 // bumpMBB simply decreases the stack pointer, since we know the current
36667 // stacklet has enough space.
36668 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36669 .addReg(SPLimitVReg);
36670 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36671 .addReg(SPLimitVReg);
36672 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36673
36674 // Calls into a routine in libgcc to allocate more space from the heap.
36675 const uint32_t *RegMask =
36677 if (IsLP64) {
36678 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36679 .addReg(sizeVReg);
36680 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36681 .addExternalSymbol("__morestack_allocate_stack_space")
36682 .addRegMask(RegMask)
36683 .addReg(X86::RDI, RegState::Implicit)
36684 .addReg(X86::RAX, RegState::ImplicitDefine);
36685 } else if (Is64Bit) {
36686 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36687 .addReg(sizeVReg);
36688 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36689 .addExternalSymbol("__morestack_allocate_stack_space")
36690 .addRegMask(RegMask)
36691 .addReg(X86::EDI, RegState::Implicit)
36692 .addReg(X86::EAX, RegState::ImplicitDefine);
36693 } else {
36694 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36695 .addImm(12);
36696 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36697 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36698 .addExternalSymbol("__morestack_allocate_stack_space")
36699 .addRegMask(RegMask)
36700 .addReg(X86::EAX, RegState::ImplicitDefine);
36701 }
36702
36703 if (!Is64Bit)
36704 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36705 .addImm(16);
36706
36707 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36708 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36709 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36710
36711 // Set up the CFG correctly.
36712 BB->addSuccessor(bumpMBB);
36713 BB->addSuccessor(mallocMBB);
36714 mallocMBB->addSuccessor(continueMBB);
36715 bumpMBB->addSuccessor(continueMBB);
36716
36717 // Take care of the PHI nodes.
36718 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36719 MI.getOperand(0).getReg())
36720 .addReg(mallocPtrVReg)
36721 .addMBB(mallocMBB)
36722 .addReg(bumpSPPtrVReg)
36723 .addMBB(bumpMBB);
36724
36725 // Delete the original pseudo instruction.
36726 MI.eraseFromParent();
36727
36728 // And we're done.
36729 return continueMBB;
36730}
36731
// Lower a CATCHRET pseudo. On 32-bit targets the catchret's destination is
// rerouted through a fresh EH-pad block so PEI inserts the stack-pointer
// restore code there before jumping on to the real target. 64-bit targets
// need no fixup and return the block unchanged.
// NOTE(review): this listing is missing a few source lines (the return type,
// the head of the assert, and the CreateMachineBasicBlock call) — confirm
// against the upstream file before relying on exact details.
36733X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36734                                     MachineBasicBlock *BB) const {
36735  MachineFunction *MF = BB->getParent();
36736  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36737  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36738  const MIMetadata MIMD(MI);
36739
36742         "SEH does not use catchret!");
36743
36744  // Only 32-bit EH needs to worry about manually restoring stack pointers.
36745  if (!Subtarget.is32Bit())
36746    return BB;
36747
36748  // C++ EH creates a new target block to hold the restore code, and wires up
36749  // the new block to the return destination with a normal JMP_4.
36750  MachineBasicBlock *RestoreMBB =
36752  assert(BB->succ_size() == 1);
36753  MF->insert(std::next(BB->getIterator()), RestoreMBB);
36754  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36755  BB->addSuccessor(RestoreMBB);
  // Retarget the catchret so control flows BB -> RestoreMBB -> TargetMBB.
36756  MI.getOperand(0).setMBB(RestoreMBB);
36757
36758  // Marking this as an EH pad but not a funclet entry block causes PEI to
36759  // restore stack pointers in the block.
36760  RestoreMBB->setIsEHPad(true);
36761
  // The restore block's only instruction is the jump to the real destination;
  // the SP-restoring code is inserted in front of it later by PEI.
36762  auto RestoreMBBI = RestoreMBB->begin();
36763  BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36764  return BB;
36765}
36766
// Lower the Darwin TLSCall pseudo: load the resolver function pointer that the
// relocation gave us into RDI (64-bit) or EAX (32-bit) and call through it
// indirectly; the result comes back in the usual return register.
// NOTE(review): the RegMask initializer's call-preserved-mask tail and the
// MachineInstrBuilder declarations are missing from this listing — confirm
// against the upstream file.
36768X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36769                                      MachineBasicBlock *BB) const {
36770  // This is pretty easy. We're taking the value that we received from
36771  // our load from the relocation, sticking it in either RDI (x86-64)
36772  // or EAX and doing an indirect call. The return value will then
36773  // be in the normal return register.
36774  MachineFunction *F = BB->getParent();
36775  const X86InstrInfo *TII = Subtarget.getInstrInfo();
36776  const MIMetadata MIMD(MI);
36777
36778  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36779  assert(MI.getOperand(3).isGlobal() && "This should be a global");
36780
36781  // Get a register mask for the lowered call.
36782  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36783  // proper register mask.
36784  const uint32_t *RegMask =
36785      Subtarget.is64Bit() ?
  // 64-bit: RIP-relative load of the TLV descriptor, then CALL64m through RDI.
36788  if (Subtarget.is64Bit()) {
36790        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36791            .addReg(X86::RIP)
36792            .addImm(0)
36793            .addReg(0)
36794            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36795                              MI.getOperand(3).getTargetFlags())
36796            .addReg(0);
36797    MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36798    addDirectMem(MIB, X86::RDI);
36799    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
  // 32-bit non-PIC: absolute-address load into EAX, then CALL32m through EAX.
36800  } else if (!isPositionIndependent()) {
36802        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36803            .addReg(0)
36804            .addImm(0)
36805            .addReg(0)
36806            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36807                              MI.getOperand(3).getTargetFlags())
36808            .addReg(0);
36809    MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36810    addDirectMem(MIB, X86::EAX);
36811    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  // 32-bit PIC: address the descriptor relative to the PIC base register.
36812  } else {
36814        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36815            .addReg(TII->getGlobalBaseReg(F))
36816            .addImm(0)
36817            .addReg(0)
36818            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36819                              MI.getOperand(3).getTargetFlags())
36820            .addReg(0);
36821    MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36822    addDirectMem(MIB, X86::EAX);
36823    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36824  }
36825
36826  MI.eraseFromParent(); // The pseudo instruction is gone now.
36827  return BB;
36828}
36829
36830static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36831 switch (RPOpc) {
36832 case X86::INDIRECT_THUNK_CALL32:
36833 return X86::CALLpcrel32;
36834 case X86::INDIRECT_THUNK_CALL64:
36835 return X86::CALL64pcrel32;
36836 case X86::INDIRECT_THUNK_TCRETURN32:
36837 return X86::TCRETURNdi;
36838 case X86::INDIRECT_THUNK_TCRETURN64:
36839 return X86::TCRETURNdi64;
36840 }
36841 llvm_unreachable("not indirect thunk opcode");
36842}
36843
36844static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36845 Register Reg) {
36846 if (Subtarget.useRetpolineExternalThunk()) {
36847 // When using an external thunk for retpolines, we pick names that match the
36848 // names GCC happens to use as well. This helps simplify the implementation
36849 // of the thunks for kernels where they have no easy ability to create
36850 // aliases and are doing non-trivial configuration of the thunk's body. For
36851 // example, the Linux kernel will do boot-time hot patching of the thunk
36852 // bodies and cannot easily export aliases of these to loaded modules.
36853 //
36854 // Note that at any point in the future, we may need to change the semantics
36855 // of how we implement retpolines and at that time will likely change the
36856 // name of the called thunk. Essentially, there is no hard guarantee that
36857 // LLVM will generate calls to specific thunks, we merely make a best-effort
36858 // attempt to help out kernels and other systems where duplicating the
36859 // thunks is costly.
36860 switch (Reg.id()) {
36861 case X86::EAX:
36862 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36863 return "__x86_indirect_thunk_eax";
36864 case X86::ECX:
36865 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36866 return "__x86_indirect_thunk_ecx";
36867 case X86::EDX:
36868 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36869 return "__x86_indirect_thunk_edx";
36870 case X86::EDI:
36871 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36872 return "__x86_indirect_thunk_edi";
36873 case X86::R11:
36874 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36875 return "__x86_indirect_thunk_r11";
36876 }
36877 llvm_unreachable("unexpected reg for external indirect thunk");
36878 }
36879
36880 if (Subtarget.useRetpolineIndirectCalls() ||
36881 Subtarget.useRetpolineIndirectBranches()) {
36882 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36883 switch (Reg.id()) {
36884 case X86::EAX:
36885 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36886 return "__llvm_retpoline_eax";
36887 case X86::ECX:
36888 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36889 return "__llvm_retpoline_ecx";
36890 case X86::EDX:
36891 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36892 return "__llvm_retpoline_edx";
36893 case X86::EDI:
36894 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36895 return "__llvm_retpoline_edi";
36896 case X86::R11:
36897 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36898 return "__llvm_retpoline_r11";
36899 }
36900 llvm_unreachable("unexpected reg for retpoline");
36901 }
36902
36903 if (Subtarget.useLVIControlFlowIntegrity()) {
36904 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36905 return "__llvm_lvi_thunk_r11";
36906 }
36907 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36908}
36909
// Lower an INDIRECT_THUNK_* pseudo: copy the callee virtual register into a
// scratch physical register, then rewrite the pseudo in place into a direct
// call/tail-call to the matching thunk symbol (which branches through that
// register).
// NOTE(review): the MachineInstrBuilder wrapping of MI before line 36957 is
// missing from this listing — confirm against the upstream file.
36911X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36912                                            MachineBasicBlock *BB) const {
36913  // Copy the virtual register into the R11 physical register and
36914  // call the retpoline thunk.
36915  const MIMetadata MIMD(MI);
36916  const X86InstrInfo *TII = Subtarget.getInstrInfo();
36917  Register CalleeVReg = MI.getOperand(0).getReg();
36918  unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36919
36920  // Find an available scratch register to hold the callee. On 64-bit, we can
36921  // just use R11, but we scan for uses anyway to ensure we don't generate
36922  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36923  // already a register use operand to the call to hold the callee. If none
36924  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36925  // register and ESI is the base pointer to realigned stack frames with VLAs.
36926  SmallVector<Register, 3> AvailableRegs;
36927  if (Subtarget.is64Bit())
36928    AvailableRegs.push_back(X86::R11);
36929  else
36930    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36931
36932  // Zero out any registers that are already used.
36933  for (const auto &MO : MI.operands()) {
36934    if (MO.isReg() && MO.isUse())
36935      llvm::replace(AvailableRegs, MO.getReg(), Register());
36936  }
36937
36938  // Choose the first remaining non-zero available register.
36939  Register AvailableReg;
36940  for (Register MaybeReg : AvailableRegs) {
36941    if (MaybeReg) {
36942      AvailableReg = MaybeReg;
36943      break;
36944    }
36945  }
  // No scratch register left: the call's own argument registers consumed every
  // candidate, so this calling convention cannot be retpolined.
36946  if (!AvailableReg)
36947    report_fatal_error("calling convention incompatible with retpoline, no "
36948                       "available registers");
36949
36950  const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36951
  // Materialize the callee in the scratch register, then mutate the pseudo
  // into the real (tail-)call: its first operand becomes the thunk's external
  // symbol and the scratch register is added as an implicit killed use.
36952  BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36953      .addReg(CalleeVReg);
36954  MI.getOperand(0).ChangeToES(Symbol);
36955  MI.setDesc(TII->get(Opc));
36957      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36958  return BB;
36959}
36960
36961/// SetJmp implies future control flow change upon calling the corresponding
36962/// LongJmp.
36963/// Instead of using the 'return' instruction, the long jump fixes the stack and
36964/// performs an indirect branch. To do so it uses the registers that were stored
36965/// in the jump buffer (when calling SetJmp).
36966/// In case the shadow stack is enabled we need to fix it as well, because some
36967/// return addresses will be skipped.
36968/// The function will save the SSP for future fixing in the function
36969/// emitLongJmpShadowStackFix.
36970/// \sa emitLongJmpShadowStackFix
36971/// \param [in] MI The temporary Machine Instruction for the builtin.
36972/// \param [in] MBB The Machine Basic Block that will be modified.
// (See the doxygen block above.) Saves the current shadow-stack pointer into
// slot 3 of the setjmp buffer so emitLongJmpShadowStackFix can restore it.
// NOTE(review): the MRI/MIB declarations (lines between 36977 and 36980) are
// missing from this listing — confirm against the upstream file.
36973void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36974                                                 MachineBasicBlock *MBB) const {
36975  const MIMetadata MIMD(MI);
36976  MachineFunction *MF = MBB->getParent();
36977  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36980
36981  // Memory Reference.
36982  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36983
36984  // Initialize a register with zero.
36985  MVT PVT = getPointerTy(MF->getDataLayout());
36986  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36987  Register ZReg = MRI.createVirtualRegister(PtrRC);
36988  unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36989  BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36990      .addDef(ZReg)
36991      .addReg(ZReg, RegState::Undef)
36992      .addReg(ZReg, RegState::Undef);
36993
36994  // Read the current SSP Register value to the zeroed register.
  // NOTE(review): the zeroing matters — presumably RDSSP leaves the register
  // untouched when shadow stacks are disabled, so a zero SSP is then stored;
  // confirm against the CET instruction reference.
36995  Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36996  unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36997  BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36998
36999  // Write the SSP register value to offset 3 in input memory buffer.
37000  unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37001  MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37002  const int64_t SSPOffset = 3 * PVT.getStoreSize();
  // The buffer's 5 address operands start at operand 1 of the pseudo; copy
  // them across, adding SSPOffset to the displacement component.
37003  const unsigned MemOpndSlot = 1;
37004  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37005    if (i == X86::AddrDisp)
37006      MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37007    else
37008      MIB.add(MI.getOperand(MemOpndSlot + i));
37009  }
37010  MIB.addReg(SSPCopyReg);
37011  MIB.setMemRefs(MMOs);
37012}
37013
// Expand the EH_SjLj_SetJmp pseudo into the four-block diamond sketched in
// the comment below: thisMBB stores the resume address into the buffer and
// emits the EH_SjLj_Setup marker; mainMBB yields 0 (direct return); restoreMBB
// (the longjmp landing block) yields 1; sinkMBB PHIs the result together.
// NOTE(review): several declaration lines (MRI, the insertion iterator I, the
// MachineInstrBuilder MIB, the UseImmLabel condition tail, X86FI, and the
// kill-flag line after addRegOffset) are missing from this listing — confirm
// against the upstream file.
37015X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37016                                    MachineBasicBlock *MBB) const {
37017  const MIMetadata MIMD(MI);
37018  MachineFunction *MF = MBB->getParent();
37019  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37020  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37022
37023  const BasicBlock *BB = MBB->getBasicBlock();
37025
37026  // Memory Reference
37027  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37028
37029  unsigned MemOpndSlot = 0;
37030
37031  unsigned CurOp = 0;
37032
  // Operand 0 is the i32 result; the buffer address operands follow it.
37033  Register DstReg = MI.getOperand(CurOp++).getReg();
37034  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37035  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37036  (void)TRI;
37037  Register mainDstReg = MRI.createVirtualRegister(RC);
37038  Register restoreDstReg = MRI.createVirtualRegister(RC);
37039
37040  MemOpndSlot = CurOp;
37041
37042  MVT PVT = getPointerTy(MF->getDataLayout());
37043  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37044         "Invalid Pointer Size!");
37045
37046  // For v = setjmp(buf), we generate
37047  //
37048  // thisMBB:
37049  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37050  //  SjLjSetup restoreMBB
37051  //
37052  // mainMBB:
37053  //  v_main = 0
37054  //
37055  // sinkMBB:
37056  //  v = phi(main, restore)
37057  //
37058  // restoreMBB:
37059  //  if base pointer being used, load it from frame
37060  //  v_restore = 1
37061
37062  MachineBasicBlock *thisMBB = MBB;
37063  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37064  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37065  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37066  MF->insert(I, mainMBB);
37067  MF->insert(I, sinkMBB);
  // restoreMBB goes at the end of the function; its address escapes into the
  // setjmp buffer, so mark it address-taken.
37068  MF->push_back(restoreMBB);
37069  restoreMBB->setMachineBlockAddressTaken();
37070
37072
37073  // Transfer the remainder of BB and its successor edges to sinkMBB.
37074  sinkMBB->splice(sinkMBB->begin(), MBB,
37075                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37077
37078  // thisMBB:
37079  unsigned PtrStoreOpc = 0;
37080  Register LabelReg;
37081  const int64_t LabelOffset = 1 * PVT.getStoreSize();
37082  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37084
37085  // Prepare IP either in reg or imm.
  // When the block address cannot be encoded as an immediate, materialize it
  // with an LEA (RIP-relative on 64-bit, PIC-base-relative on 32-bit).
37086  if (!UseImmLabel) {
37087    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37088    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37089    LabelReg = MRI.createVirtualRegister(PtrRC);
37090    if (Subtarget.is64Bit()) {
37091      MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37092                .addReg(X86::RIP)
37093                .addImm(0)
37094                .addReg(0)
37095                .addMBB(restoreMBB)
37096                .addReg(0);
37097    } else {
37098      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37099      MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37100                .addReg(XII->getGlobalBaseReg(MF))
37101                .addImm(0)
37102                .addReg(0)
37103                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37104                .addReg(0);
37105    }
37106  } else
37107    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37108  // Store IP
37109  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37110  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37111    if (i == X86::AddrDisp)
37112      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37113    else
37114      MIB.add(MI.getOperand(MemOpndSlot + i));
37115  }
37116  if (!UseImmLabel)
37117    MIB.addReg(LabelReg);
37118  else
37119    MIB.addMBB(restoreMBB);
37120  MIB.setMemRefs(MMOs);
37121
  // With CET return protection enabled, also save the shadow-stack pointer
  // into the buffer so longjmp can repair the shadow stack.
37122  if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37123    emitSetJmpShadowStackFix(MI, thisMBB);
37124  }
37125
37126  // Setup
37127  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37128            .addMBB(restoreMBB);
37129
  // Setjmp may be re-entered via longjmp with all registers clobbered, so the
  // setup instruction preserves nothing.
37130  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37131  MIB.addRegMask(RegInfo->getNoPreservedMask());
37132  thisMBB->addSuccessor(mainMBB);
37133  thisMBB->addSuccessor(restoreMBB);
37134
37135  // mainMBB:
37136  //  EAX = 0
37137  BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37138  mainMBB->addSuccessor(sinkMBB);
37139
37140  // sinkMBB:
37141  BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37142      .addReg(mainDstReg)
37143      .addMBB(mainMBB)
37144      .addReg(restoreDstReg)
37145      .addMBB(restoreMBB);
37146
37147  // restoreMBB:
  // If a base pointer is in use it was clobbered by the longjmp; reload it
  // from the slot PEI reserves in the frame.
37148  if (RegInfo->hasBasePointer(*MF)) {
37149    const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37151    X86FI->setRestoreBasePointer(MF);
37152    Register FramePtr = RegInfo->getFrameRegister(*MF);
37153    Register BasePtr = RegInfo->getBaseRegister();
37154    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37155    addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37156                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37158  }
37159  BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37160  BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37161  restoreMBB->addSuccessor(sinkMBB);
37162
37163  MI.eraseFromParent();
37164  return sinkMBB;
37165}
37166
37167/// Fix the shadow stack using the previously saved SSP pointer.
37168/// \sa emitSetJmpShadowStackFix
37169/// \param [in] MI The temporary Machine Instruction for the builtin.
37170/// \param [in] MBB The Machine Basic Block that will be modified.
37171/// \return The sink MBB that will perform the future indirect branch.
// (See the doxygen block above.) Expands to the checkSsp / fall / fixShadow /
// loop-prepare / loop / sink block chain sketched in the pseudo-code comment
// below, advancing the shadow-stack pointer by (savedSSP - currentSSP) slots.
// NOTE(review): a few lines are missing from this listing (the MRI
// declaration, the insertion iterator I, the successor-transfer line after
// the splice, the MIB declaration, and the COND_* immediates on the JCC_1
// builders) — confirm against the upstream file.
37173X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37174                                             MachineBasicBlock *MBB) const {
37175  const MIMetadata MIMD(MI);
37176  MachineFunction *MF = MBB->getParent();
37177  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37179
37180  // Memory Reference
37181  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37182
37183  MVT PVT = getPointerTy(MF->getDataLayout());
37184  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37185
37186  // checkSspMBB:
37187  //         xor vreg1, vreg1
37188  //         rdssp vreg1
37189  //         test vreg1, vreg1
37190  //         je sinkMBB   # Jump if Shadow Stack is not supported
37191  // fallMBB:
37192  //         mov buf+24/12(%rip), vreg2
37193  //         sub vreg1, vreg2
37194  //         jbe sinkMBB  # No need to fix the Shadow Stack
37195  // fixShadowMBB:
37196  //         shr 3/2, vreg2
37197  //         incssp vreg2  # fix the SSP according to the lower 8 bits
37198  //         shr 8, vreg2
37199  //         je sinkMBB
37200  // fixShadowLoopPrepareMBB:
37201  //         shl vreg2
37202  //         mov 128, vreg3
37203  // fixShadowLoopMBB:
37204  //         incssp vreg3
37205  //         dec vreg2
37206  //         jne fixShadowLoopMBB # Iterate until you finish fixing
37207  //                              # the Shadow Stack
37208  // sinkMBB:
37209
37211  const BasicBlock *BB = MBB->getBasicBlock();
37212
37213  MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37214  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37215  MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37216  MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37217  MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37218  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37219  MF->insert(I, checkSspMBB);
37220  MF->insert(I, fallMBB);
37221  MF->insert(I, fixShadowMBB);
37222  MF->insert(I, fixShadowLoopPrepareMBB);
37223  MF->insert(I, fixShadowLoopMBB);
37224  MF->insert(I, sinkMBB);
37225
  // Note: the longjmp pseudo itself moves into sinkMBB (the iterator starts at
  // MI, not after it) — the caller re-expands it there.
37226  // Transfer the remainder of BB and its successor edges to sinkMBB.
37227  sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37228                  MBB->end());
37230
37231  MBB->addSuccessor(checkSspMBB);
37232
37233  // Initialize a register with zero.
37234  Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37235  BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37236
  // On 64-bit targets widen the 32-bit zero to the pointer width for RDSSPQ.
37237  if (PVT == MVT::i64) {
37238    Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37239    BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37240        .addImm(0)
37241        .addReg(ZReg)
37242        .addImm(X86::sub_32bit);
37243    ZReg = TmpZReg;
37244  }
37245
37246  // Read the current SSP Register value to the zeroed register.
37247  Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37248  unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37249  BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37250
37251  // Check whether the result of the SSP register is zero and jump directly
37252  // to the sink.
37253  unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37254  BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37255      .addReg(SSPCopyReg)
37256      .addReg(SSPCopyReg);
37257  BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37258      .addMBB(sinkMBB)
37260  checkSspMBB->addSuccessor(sinkMBB);
37261  checkSspMBB->addSuccessor(fallMBB);
37262
37263  // Reload the previously saved SSP register value.
  // Slot 3 of the buffer, written by emitSetJmpShadowStackFix.
37264  Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37265  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37266  const int64_t SPPOffset = 3 * PVT.getStoreSize();
37268      BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37269  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37270    const MachineOperand &MO = MI.getOperand(i);
37271    if (i == X86::AddrDisp)
37272      MIB.addDisp(MO, SPPOffset);
37273    else if (MO.isReg()) // Don't add the whole operand, we don't want to
37274                         // preserve kill flags.
37275      MIB.addReg(MO.getReg());
37276    else
37277      MIB.add(MO);
37278  }
37279  MIB.setMemRefs(MMOs);
37280
37281  // Subtract the current SSP from the previous SSP.
37282  Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37283  unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37284  BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37285      .addReg(PrevSSPReg)
37286      .addReg(SSPCopyReg);
37287
37288  // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37289  BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37290      .addMBB(sinkMBB)
37292  fallMBB->addSuccessor(sinkMBB);
37293  fallMBB->addSuccessor(fixShadowMBB);
37294
37295  // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37296  unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37297  unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37298  Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37299  BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37300      .addReg(SspSubReg)
37301      .addImm(Offset);
37302
37303  // Increase SSP when looking only on the lower 8 bits of the delta.
37304  unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37305  BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37306
37307  // Reset the lower 8 bits.
37308  Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37309  BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37310      .addReg(SspFirstShrReg)
37311      .addImm(8);
37312
37313  // Jump if the result of the shift is zero.
37314  BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37315      .addMBB(sinkMBB)
37317  fixShadowMBB->addSuccessor(sinkMBB);
37318  fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37319
37320  // Do a single shift left.
  // NOTE(review): presumably this halves the per-iteration step so the loop
  // counter and the 128-slot increments compose to the remaining delta;
  // confirm against the upstream algorithm.
37321  unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37322  Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37323  BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37324      .addReg(SspSecondShrReg)
37325      .addImm(1);
37326
37327  // Save the value 128 to a register (will be used next with incssp).
37328  Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37329  unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37330  BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37331      .addImm(128);
37332  fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37333
37334  // Since incssp only looks at the lower 8 bits, we might need to do several
37335  // iterations of incssp until we finish fixing the shadow stack.
37336  Register DecReg = MRI.createVirtualRegister(PtrRC);
37337  Register CounterReg = MRI.createVirtualRegister(PtrRC);
37338  BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37339      .addReg(SspAfterShlReg)
37340      .addMBB(fixShadowLoopPrepareMBB)
37341      .addReg(DecReg)
37342      .addMBB(fixShadowLoopMBB);
37343
37344  // Every iteration we increase the SSP by 128.
37345  BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37346
37347  // Every iteration we decrement the counter by 1.
37348  unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37349  BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37350
37351  // Jump if the counter is not zero yet.
37352  BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37353      .addMBB(fixShadowLoopMBB)
37355  fixShadowLoopMBB->addSuccessor(sinkMBB);
37356  fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37357
37358  return sinkMBB;
37359}
37360
// Expand the EH_SjLj_LongJmp pseudo: after optionally repairing the shadow
// stack, reload FP, the resume address (slot 1), and SP (slot 2) from the
// jump buffer, then branch indirectly to the resume address.
// NOTE(review): the MRI and MIB declarations (and two blank separator lines)
// are missing from this listing — confirm against the upstream file.
37362X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37363                                     MachineBasicBlock *MBB) const {
37364  const MIMetadata MIMD(MI);
37365  MachineFunction *MF = MBB->getParent();
37366  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37368
37369  // Memory Reference
37370  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37371
37372  MVT PVT = getPointerTy(MF->getDataLayout());
37373  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37374         "Invalid Pointer Size!");
37375
37376  const TargetRegisterClass *RC =
37377      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37378  Register Tmp = MRI.createVirtualRegister(RC);
37379  // Since FP is only updated here but NOT referenced, it's treated as GPR.
37380  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37381  Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37382  Register SP = RegInfo->getStackRegister();
37383
37385
  // Buffer layout: slot 0 = FP, slot 1 = resume IP, slot 2 = SP,
  // slot 3 = saved SSP (see emitSetJmpShadowStackFix).
37386  const int64_t LabelOffset = 1 * PVT.getStoreSize();
37387  const int64_t SPOffset = 2 * PVT.getStoreSize();
37388
37389  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37390  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37391
37392  MachineBasicBlock *thisMBB = MBB;
37393
37394  // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
  // emitLongJmpShadowStackFix splits the block; continue in the sink it
  // returns.
37395  if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37396    thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37397  }
37398
37399  // Reload FP
37400  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37401  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37402    const MachineOperand &MO = MI.getOperand(i);
37403    if (MO.isReg()) // Don't add the whole operand, we don't want to
37404                    // preserve kill flags.
37405      MIB.addReg(MO.getReg());
37406    else
37407      MIB.add(MO);
37408  }
37409  MIB.setMemRefs(MMOs);
37411
37412  // Reload IP
37413  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37414  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37415    const MachineOperand &MO = MI.getOperand(i);
37416    if (i == X86::AddrDisp)
37417      MIB.addDisp(MO, LabelOffset);
37418    else if (MO.isReg()) // Don't add the whole operand, we don't want to
37419                         // preserve kill flags.
37420      MIB.addReg(MO.getReg());
37421    else
37422      MIB.add(MO);
37423  }
37424  MIB.setMemRefs(MMOs);
37425
37426  // Reload SP
37427  MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37428  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37429    if (i == X86::AddrDisp)
37430      MIB.addDisp(MI.getOperand(i), SPOffset);
37431    else
37432      MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37433                                 // the last instruction of the expansion.
37434  }
37435  MIB.setMemRefs(MMOs);
37437
37438  // Jump
37439  BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37440
37441  MI.eraseFromParent();
37442  return thisMBB;
37443}
37444
// Store the address of DispatchBB into the SjLj function context at frame
// index FI so the runtime can transfer control to the dispatch code. The
// address is either an immediate (small non-PIC code model) or materialized
// with an LEA into a virtual register first.
// NOTE(review): the MBB parameter line, the MRI declaration, and the tail of
// the UseImmLabel condition are missing from this listing — confirm against
// the upstream file.
37445void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37447                                               MachineBasicBlock *DispatchBB,
37448                                               int FI) const {
37449  const MIMetadata MIMD(MI);
37450  MachineFunction *MF = MBB->getParent();
37452  const X86InstrInfo *TII = Subtarget.getInstrInfo();
37453
37454  MVT PVT = getPointerTy(MF->getDataLayout());
37455  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37456
37457  unsigned Op = 0;
37458  Register VR;
37459
37460  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37462
37463  if (UseImmLabel) {
37464    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37465  } else {
37466    const TargetRegisterClass *TRC =
37467        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37468    VR = MRI->createVirtualRegister(TRC);
37469    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37470
37471    if (Subtarget.is64Bit())
37472      BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37473          .addReg(X86::RIP)
37474          .addImm(1)
37475          .addReg(0)
37476          .addMBB(DispatchBB)
37477          .addReg(0);
37478    else
37479      BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37480          .addReg(0) /* TII->getGlobalBaseReg(MF) */
37481          .addImm(1)
37482          .addReg(0)
37483          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37484          .addReg(0);
37485  }
37486
  // NOTE(review): 56/36 are byte offsets into the SjLj function context
  // object; presumably they must match the runtime's context layout — confirm.
37487  MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37488  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37489  if (UseImmLabel)
37490    MIB.addMBB(DispatchBB);
37491  else
37492    MIB.addReg(VR);
37493}
37494
// Build the SjLj exception dispatch machinery for this function:
//  1. collect every EH pad and the call-site numbers that can reach it,
//  2. build a jump table of landing pads indexed by call-site number,
//  3. emit a dispatch block that loads the call-site index from the function
//     context, bounds-checks it (trapping when out of range), and performs an
//     indirect jump through the jump table,
//  4. rewire every invoke block to branch to the dispatch block instead of
//     its landing pads, and demote the old landing pads to regular blocks.
// NOTE(review): this rendering elides some lines (e.g. the declarations of
// CallSiteNumToLPad, InvokeBBs, SeenMBBs, MBBLPads and a few operand lines);
// annotations cover only what is visible.
37496X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37497                                         MachineBasicBlock *BB) const {
37498  const MIMetadata MIMD(MI);
37499  MachineFunction *MF = BB->getParent();
37501  const X86InstrInfo *TII = Subtarget.getInstrInfo();
37502  int FI = MF->getFrameInfo().getFunctionContextIndex();
37503
37504  // Get a mapping of the call site numbers to all of the landing pads they're
37505  // associated with.
37507  unsigned MaxCSNum = 0;
37508  for (auto &MBB : *MF) {
37509    if (!MBB.isEHPad())
37510      continue;
37511
    // The first non-debug instruction of an EH pad is its EH_LABEL; its
    // MCSymbol keys the call-site table.
37512    MCSymbol *Sym = nullptr;
37513    for (const auto &MI : MBB) {
37514      if (MI.isDebugInstr())
37515        continue;
37516
37517      assert(MI.isEHLabel() && "expected EH_LABEL");
37518      Sym = MI.getOperand(0).getMCSymbol();
37519      break;
37520    }
37521
37522    if (!MF->hasCallSiteLandingPad(Sym))
37523      continue;
37524
37525    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37526      CallSiteNumToLPad[CSI].push_back(&MBB);
37527      MaxCSNum = std::max(MaxCSNum, CSI);
37528    }
37529  }
37530
37531  // Get an ordered list of the machine basic blocks for the jump table.
37532  std::vector<MachineBasicBlock *> LPadList;
37534  LPadList.reserve(CallSiteNumToLPad.size());
37535
  // Call-site numbers start at 1; 0 means "no landing pad".
37536  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37537    for (auto &LP : CallSiteNumToLPad[CSI]) {
37538      LPadList.push_back(LP);
37539      InvokeBBs.insert_range(LP->predecessors());
37540    }
37541  }
37542
37543  assert(!LPadList.empty() &&
37544         "No landing pad destinations for the dispatch jump table!");
37545
37546  // Create the MBBs for the dispatch code.
37547
37548  // Shove the dispatch's address into the return slot in the function context.
37549  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37550  DispatchBB->setIsEHPad(true);
37551
  // TrapBB catches out-of-range call-site indices (corrupt context).
37552  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37553  BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37554  DispatchBB->addSuccessor(TrapBB);
37555
37556  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37557  DispatchBB->addSuccessor(DispContBB);
37558
37559  // Insert MBBs.
37560  MF->push_back(DispatchBB);
37561  MF->push_back(DispContBB);
37562  MF->push_back(TrapBB);
37563
37564  // Insert code into the entry block that creates and registers the function
37565  // context.
37566  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37567
37568  // Create the jump table and associated information
37569  unsigned JTE = getJumpTableEncoding();
37570  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37571  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37572
37573  const X86RegisterInfo &RI = TII->getRegisterInfo();
37574  // Add a register mask with no preserved registers. This results in all
37575  // registers being marked as clobbered.
37576  if (RI.hasBasePointer(*MF)) {
    // Reload the base pointer from its save slot relative to the frame
    // pointer — the dispatch block is entered from the unwinder, so the base
    // pointer register contents cannot be trusted.
37577    const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37578    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37579    MFI->setRestoreBasePointer(MF);
37580
37581    Register FP = RI.getFrameRegister(*MF);
37582    Register BP = RI.getBaseRegister();
37583    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37584    addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37587  } else {
37588    BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37590  }
37591
37592  // IReg is used as an index in a memory operand and therefore can't be SP
37593  Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
  // Load the call-site index from the function context (offset depends on
  // pointer size), then range-check it against the landing-pad count.
37594  addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37595                    Subtarget.is64Bit() ? 8 : 4);
37596  BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37597      .addReg(IReg)
37598      .addImm(LPadList.size());
37599  BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37600      .addMBB(TrapBB)
37602
37603  if (Subtarget.is64Bit()) {
37604    Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37605    Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37606
37607    // leaq .LJTI0_0(%rip), BReg
37608    BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37609        .addReg(X86::RIP)
37610        .addImm(1)
37611        .addReg(0)
37612        .addJumpTableIndex(MJTI)
37613        .addReg(0);
37614    // movzx IReg64, IReg
37615    BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37616        .addImm(0)
37617        .addReg(IReg)
37618        .addImm(X86::sub_32bit);
37619
    // Emit the indirect jump; its shape depends on the jump-table entry
    // encoding (the case labels are elided in this rendering).
37620    switch (JTE) {
      // Jump-table entries are full 8-byte block addresses.
37622      // jmpq *(BReg,IReg64,8)
37623      BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37624          .addReg(BReg)
37625          .addImm(8)
37626          .addReg(IReg64)
37627          .addImm(0)
37628          .addReg(0);
37629      break;
      // Jump-table entries are 4-byte offsets: load, sign-extend, add the
      // table base, then jump through the computed address.
37631      Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37632      Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37633      Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37634
37635      // movl (BReg,IReg64,4), OReg
37636      BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37637          .addReg(BReg)
37638          .addImm(4)
37639          .addReg(IReg64)
37640          .addImm(0)
37641          .addReg(0);
37642      // movsx OReg64, OReg
37643      BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37644          .addReg(OReg);
37645      // addq BReg, OReg64, TReg
37646      BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37647          .addReg(OReg64)
37648          .addReg(BReg);
37649      // jmpq *TReg
37650      BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37651      break;
37652    }
37653    default:
37654      llvm_unreachable("Unexpected jump table encoding");
37655    }
37656  } else {
37657    // jmpl *.LJTI0_0(,IReg,4)
37658    BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37659        .addReg(0)
37660        .addImm(4)
37661        .addReg(IReg)
37662        .addJumpTableIndex(MJTI)
37663        .addReg(0);
37664  }
37665
37666  // Add the jump table entries as successors to the MBB.
  // SeenMBBs (declaration elided) de-duplicates landing pads that appear
  // under multiple call-site numbers.
37668  for (auto &LP : LPadList)
37669    if (SeenMBBs.insert(LP).second)
37670      DispContBB->addSuccessor(LP);
37671
37672  // N.B. the order the invoke BBs are processed in doesn't matter here.
37674  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37675  for (MachineBasicBlock *MBB : InvokeBBs) {
37676    // Remove the landing pad successor from the invoke block and replace it
37677    // with the new dispatch block.
37678    // Keep a copy of Successors since it's modified inside the loop.
37680                                                   MBB->succ_rend());
37681    // FIXME: Avoid quadratic complexity.
37682    for (auto *MBBS : Successors) {
37683      if (MBBS->isEHPad()) {
37684        MBB->removeSuccessor(MBBS);
37685        MBBLPads.push_back(MBBS);
37686      }
37687    }
37688
37689    MBB->addSuccessor(DispatchBB);
37690
37691    // Find the invoke call and mark all of the callee-saved registers as
37692    // 'implicit defined' so that they're spilled. This prevents code from
37693    // moving instructions to before the EH block, where they will never be
37694    // executed.
37695    for (auto &II : reverse(*MBB)) {
37696      if (!II.isCall())
37697        continue;
37698
      // Collect registers the call already defines/uses so we don't add a
      // duplicate implicit-def operand below.
37699      DenseSet<Register> DefRegs;
37700      for (auto &MOp : II.operands())
37701        if (MOp.isReg())
37702          DefRegs.insert(MOp.getReg());
37703
37704      MachineInstrBuilder MIB(*MF, &II);
37705      for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37706        Register Reg = SavedRegs[RegIdx];
37707        if (!DefRegs.contains(Reg))
37709      }
37710
      // Only the last call in the block is the invoke; stop after it.
37711      break;
37712    }
37713  }
37714
37715  // Mark all former landing pads as non-landing pads. The dispatch is the only
37716  // landing pad now.
37717  for (auto &LP : MBBLPads)
37718    LP->setIsEHPad(false);
37719
37720  // The instruction is gone now.
37721  MI.eraseFromParent();
37722  return BB;
37723}
37724
// Bracket a PATCHABLE_EVENT_CALL / PATCHABLE_TYPED_EVENT_CALL pseudo with
// CALLSEQ_START / CALLSEQ_END pseudos. MI itself is left in place between the
// two markers; only the bracketing instructions are inserted.
37726X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37727                                          MachineBasicBlock *BB) const {
37728  // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37729  // calls may require proper stack alignment.
37730  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37731  const MIMetadata MIMD(MI);
37732  MachineFunction &MF = *BB->getParent();
37733
37734  // Emit CALLSEQ_START right before the instruction.
37735  MF.getFrameInfo().setAdjustsStack(true);
37736  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
  // All-zero immediates: no actual byte adjustment is requested; the marker
  // pair only forces call-frame setup (and thus stack alignment) around MI.
37737  MachineInstrBuilder CallseqStart =
37738      BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37739  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37740
37741  // Emit CALLSEQ_END right after the instruction.
37742  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37743  MachineInstrBuilder CallseqEnd =
37744      BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37745  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37746
37747  return BB;
37748}
37749
// Custom inserter: expand pseudo-instructions that were marked
// usesCustomInserter into real machine instructions at scheduling time.
// Each case below handles one pseudo (or family); every case erases MI and
// returns the (possibly new) insertion block.
// NOTE(review): the signature's first line and several operand lines are
// elided in this rendering; annotations cover only what is visible.
37752                                                     MachineBasicBlock *BB) const {
37753  MachineFunction *MF = BB->getParent();
37754  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37755  const MIMetadata MIMD(MI);
37756
  // Map an AMX tile immediate (0-7) to the corresponding TMM physical
  // register.
37757  auto TMMImmToTMMReg = [](unsigned Imm) {
37758    assert (Imm < 8 && "Illegal tmm index");
37759    return X86::TMM0 + Imm;
37760  };
  // Map a tile immediate to the TMM register *pair* containing it
  // (TMM0_TMM1, TMM2_TMM3, ...).
37761  auto TMMImmToTMMPair = [](unsigned Imm) {
37762    assert(Imm < 8 && "Illegal tmm pair index.");
37763    return X86::TMM0_TMM1 + Imm / 2;
37764  };
37765  switch (MI.getOpcode()) {
37766  default:
37767    llvm_unreachable("Unexpected instr type to insert");
37768  case X86::INDIRECT_THUNK_CALL32:
37769  case X86::INDIRECT_THUNK_CALL64:
37770  case X86::INDIRECT_THUNK_TCRETURN32:
37771  case X86::INDIRECT_THUNK_TCRETURN64:
37772    return EmitLoweredIndirectThunk(MI, BB);
37773  case X86::CATCHRET:
37774    return EmitLoweredCatchRet(MI, BB);
37775  case X86::SEG_ALLOCA_32:
37776  case X86::SEG_ALLOCA_64:
37777    return EmitLoweredSegAlloca(MI, BB);
37778  case X86::PROBED_ALLOCA_32:
37779  case X86::PROBED_ALLOCA_64:
37780    return EmitLoweredProbedAlloca(MI, BB);
37781  case X86::TLSCall_32:
37782  case X86::TLSCall_64:
37783    return EmitLoweredTLSCall(MI, BB);
  // All CMOV pseudos (for types with no native cmov) lower to a diamond of
  // branches in EmitLoweredSelect.
37784  case X86::CMOV_FR16:
37785  case X86::CMOV_FR16X:
37786  case X86::CMOV_FR32:
37787  case X86::CMOV_FR32X:
37788  case X86::CMOV_FR64:
37789  case X86::CMOV_FR64X:
37790  case X86::CMOV_GR8:
37791  case X86::CMOV_GR16:
37792  case X86::CMOV_GR32:
37793  case X86::CMOV_RFP32:
37794  case X86::CMOV_RFP64:
37795  case X86::CMOV_RFP80:
37796  case X86::CMOV_VR64:
37797  case X86::CMOV_VR128:
37798  case X86::CMOV_VR128X:
37799  case X86::CMOV_VR256:
37800  case X86::CMOV_VR256X:
37801  case X86::CMOV_VR512:
37802  case X86::CMOV_VK1:
37803  case X86::CMOV_VK2:
37804  case X86::CMOV_VK4:
37805  case X86::CMOV_VK8:
37806  case X86::CMOV_VK16:
37807  case X86::CMOV_VK32:
37808  case X86::CMOV_VK64:
37809    return EmitLoweredSelect(MI, BB);
37810
37811  case X86::FP80_ADDr:
37812  case X86::FP80_ADDm32: {
37813    // Change the floating point control register to use double extended
37814    // precision when performing the addition.
37815    int OrigCWFrameIdx =
37816        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37817    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37818                      OrigCWFrameIdx);
37819
37820    // Load the old value of the control word...
37821    Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37822    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37823                      OrigCWFrameIdx);
37824
37825    // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37826    // precision.
37827    Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37828    BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37829        .addReg(OldCW, RegState::Kill)
37830        .addImm(0x300);
37831
37832    // Extract to 16 bits.
37833    Register NewCW16 =
37834        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37835    BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37836        .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37837
37838    // Prepare memory for FLDCW.
37839    int NewCWFrameIdx =
37840        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37841    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37842                      NewCWFrameIdx)
37843        .addReg(NewCW16, RegState::Kill);
37844
37845    // Reload the modified control word now...
37846    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37847                      NewCWFrameIdx);
37848
37849    // Do the addition.
37850    if (MI.getOpcode() == X86::FP80_ADDr) {
37851      BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37852          .add(MI.getOperand(0))
37853          .add(MI.getOperand(1))
37854          .add(MI.getOperand(2));
37855    } else {
      // Memory form: operands 2-6 are the 5 x86 address components.
37856      BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37857          .add(MI.getOperand(0))
37858          .add(MI.getOperand(1))
37859          .add(MI.getOperand(2))
37860          .add(MI.getOperand(3))
37861          .add(MI.getOperand(4))
37862          .add(MI.getOperand(5))
37863          .add(MI.getOperand(6));
37864    }
37865
37866    // Reload the original control word now.
37867    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37868                      OrigCWFrameIdx);
37869
37870    MI.eraseFromParent(); // The pseudo instruction is gone now.
37871    return BB;
37872  }
37873
37874  case X86::FP32_TO_INT16_IN_MEM:
37875  case X86::FP32_TO_INT32_IN_MEM:
37876  case X86::FP32_TO_INT64_IN_MEM:
37877  case X86::FP64_TO_INT16_IN_MEM:
37878  case X86::FP64_TO_INT32_IN_MEM:
37879  case X86::FP64_TO_INT64_IN_MEM:
37880  case X86::FP80_TO_INT16_IN_MEM:
37881  case X86::FP80_TO_INT32_IN_MEM:
37882  case X86::FP80_TO_INT64_IN_MEM: {
37883    // Change the floating point control register to use "round towards zero"
37884    // mode when truncating to an integer value.
37885    int OrigCWFrameIdx =
37886        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37887    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37888                      OrigCWFrameIdx);
37889
37890    // Load the old value of the control word...
37891    Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37892    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37893                      OrigCWFrameIdx);
37894
37895    // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37896    Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37897    BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37898      .addReg(OldCW, RegState::Kill).addImm(0xC00);
37899
37900    // Extract to 16 bits.
37901    Register NewCW16 =
37902        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37903    BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37904        .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37905
37906    // Prepare memory for FLDCW.
37907    int NewCWFrameIdx =
37908        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37909    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37910                      NewCWFrameIdx)
37911        .addReg(NewCW16, RegState::Kill);
37912
37913    // Reload the modified control word now...
37914    addFrameReference(BuildMI(*BB, MI, MIMD,
37915                              TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37916
37917    // Get the X86 opcode to use.
37918    unsigned Opc;
37919    switch (MI.getOpcode()) {
37920    // clang-format off
37921    default: llvm_unreachable("illegal opcode!");
37922    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37923    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37924    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37925    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37926    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37927    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37928    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37929    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37930    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37931    // clang-format on
37932    }
37933
    // Emit the FP store-to-int using the pseudo's memory operands (AM, built
    // on an elided line) plus the value register after the address operands.
37935    addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37936        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37937
37938    // Reload the original control word now.
37939    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37940                      OrigCWFrameIdx);
37941
37942    MI.eraseFromParent(); // The pseudo instruction is gone now.
37943    return BB;
37944  }
37945
37946  // xbegin
37947  case X86::XBEGIN:
37948    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37949
37950  case X86::VAARG_64:
37951  case X86::VAARG_X32:
37952    return EmitVAARGWithCustomInserter(MI, BB);
37953
37954  case X86::EH_SjLj_SetJmp32:
37955  case X86::EH_SjLj_SetJmp64:
37956    return emitEHSjLjSetJmp(MI, BB);
37957
37958  case X86::EH_SjLj_LongJmp32:
37959  case X86::EH_SjLj_LongJmp64:
37960    return emitEHSjLjLongJmp(MI, BB);
37961
37962  case X86::Int_eh_sjlj_setup_dispatch:
37963    return EmitSjLjDispatchBlock(MI, BB);
37964
37965  case TargetOpcode::STATEPOINT:
37966    // As an implementation detail, STATEPOINT shares the STACKMAP format at
37967    // this point in the process. We diverge later.
37968    return emitPatchPoint(MI, BB);
37969
37970  case TargetOpcode::STACKMAP:
37971  case TargetOpcode::PATCHPOINT:
37972    return emitPatchPoint(MI, BB);
37973
37974  case TargetOpcode::PATCHABLE_EVENT_CALL:
37975  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37976    return emitPatchableEventCall(MI, BB);
37977
37978  case X86::LCMPXCHG8B: {
37979    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37980    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
37981    // requires a memory operand. If it happens that current architecture is
37982    // i686 and for current function we need a base pointer
37983    // - which is ESI for i686 - register allocator would not be able to
37984    // allocate registers for an address in form of X(%reg, %reg, Y)
37985    // - there never would be enough unreserved registers during regalloc
37986    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
37987    // We are giving a hand to register allocator by precomputing the address in
37988    // a new vreg using LEA.
37989
37990    // If it is not i686 or there is no base pointer - nothing to do here.
37991    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37992      return BB;
37993
37994    // Even though this code does not necessarily needs the base pointer to
37995    // be ESI, we check for that. The reason: if this assert fails, there are
37996    // some changes happened in the compiler base pointer handling, which most
37997    // probably have to be addressed somehow here.
37998    assert(TRI->getBaseRegister() == X86::ESI &&
37999           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38000           "base pointer in mind");
38001
38003    MVT SPTy = getPointerTy(MF->getDataLayout());
38004    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38005    Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38006
38008    // Regalloc does not need any help when the memory operand of CMPXCHG8B
38009    // does not use index register.
38010    if (AM.IndexReg == X86::NoRegister)
38011      return BB;
38012
38013    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38014    // four operand definitions that are E[ABCD] registers. We skip them and
38015    // then insert the LEA.
38016    MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38017    while (RMBBI != BB->rend() &&
38018           (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38019            RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38020            RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38021            RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38022      ++RMBBI;
38023    }
    // Emit the LEA (insertion point MBBI is derived from RMBBI on elided
    // lines) and rewrite MI's memory operand to the precomputed address.
38026        BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38027
38028    setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38029
38030    return BB;
38031  }
38032  case X86::LCMPXCHG16B_NO_RBX: {
    // CMPXCHG16B implicitly uses RBX; if RBX is also the base pointer it must
    // be saved in a vreg and restored by the SAVE_RBX pseudo.
38033    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38034    Register BasePtr = TRI->getBaseRegister();
38035    if (TRI->hasBasePointer(*MF) &&
38036        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38037      if (!BB->isLiveIn(BasePtr))
38038        BB->addLiveIn(BasePtr);
38039      // Save RBX into a virtual register.
38040      Register SaveRBX =
38041          MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38042      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38043          .addReg(X86::RBX);
38044      Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38046          BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38047      for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38048        MIB.add(MI.getOperand(Idx));
38049      MIB.add(MI.getOperand(X86::AddrNumOperands));
38050      MIB.addReg(SaveRBX);
38051    } else {
38052      // Simple case, just copy the virtual register to RBX.
38053      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38054          .add(MI.getOperand(X86::AddrNumOperands));
38056          BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38057      for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38058        MIB.add(MI.getOperand(Idx));
38059    }
38060    MI.eraseFromParent();
38061    return BB;
38062  }
38063  case X86::MWAITX: {
38064    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38065    Register BasePtr = TRI->getBaseRegister();
38066    bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38067    // If no need to save the base pointer, we generate MWAITXrrr,
38068    // else we generate pseudo MWAITX_SAVE_RBX.
38069    if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38070      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38071          .addReg(MI.getOperand(0).getReg());
38072      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38073          .addReg(MI.getOperand(1).getReg());
38074      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38075          .addReg(MI.getOperand(2).getReg());
38076      BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38077      MI.eraseFromParent();
38078    } else {
38079      if (!BB->isLiveIn(BasePtr)) {
38080        BB->addLiveIn(BasePtr);
38081      }
38082      // Parameters can be copied into ECX and EAX but not EBX yet.
38083      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38084          .addReg(MI.getOperand(0).getReg());
38085      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38086          .addReg(MI.getOperand(1).getReg());
38087      assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38088      // Save RBX into a virtual register.
38089      Register SaveRBX =
38090          MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38091      BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38092          .addReg(X86::RBX);
38093      // Generate mwaitx pseudo.
38094      Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38095      BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38096          .addDef(Dst) // Destination tied in with SaveRBX.
38097          .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38098          .addUse(SaveRBX); // Save of base pointer.
38099      MI.eraseFromParent();
38100    }
38101    return BB;
38102  }
38103  case TargetOpcode::PREALLOCATED_SETUP: {
38104    assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38105    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38106    MFI->setHasPreallocatedCall(true);
38107    int64_t PreallocatedId = MI.getOperand(0).getImm();
38108    size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38109    assert(StackAdjustment != 0 && "0 stack adjustment");
38110    LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38111                      << StackAdjustment << "\n");
    // Reserve the preallocated argument area by bumping ESP directly.
38112    BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38113        .addReg(X86::ESP)
38114        .addImm(StackAdjustment);
38115    MI.eraseFromParent();
38116    return BB;
38117  }
38118  case TargetOpcode::PREALLOCATED_ARG: {
38119    assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38120    int64_t PreallocatedId = MI.getOperand(1).getImm();
38121    int64_t ArgIdx = MI.getOperand(2).getImm();
38122    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38123    size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38124    LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38125                      << ", arg offset " << ArgOffset << "\n");
38126    // stack pointer + offset
38127    addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38128                         MI.getOperand(0).getReg()),
38129                 X86::ESP, false, ArgOffset);
38130    MI.eraseFromParent();
38131    return BB;
38132  }
  // AMX dot-product/multiply pseudos: operands are tile immediates that are
  // rewritten to physical TMM registers below.
38133  case X86::PTDPBSSD:
38134  case X86::PTDPBSUD:
38135  case X86::PTDPBUSD:
38136  case X86::PTDPBUUD:
38137  case X86::PTDPBF16PS:
38138  case X86::PTDPFP16PS:
38139  case X86::PTCMMIMFP16PS:
38140  case X86::PTCMMRLFP16PS:
38141  case X86::PTDPBF8PS:
38142  case X86::PTDPBHF8PS:
38143  case X86::PTDPHBF8PS:
38144  case X86::PTDPHF8PS:
38145  case X86::PTTDPBF16PS:
38146  case X86::PTTDPFP16PS:
38147  case X86::PTTCMMIMFP16PS:
38148  case X86::PTTCMMRLFP16PS:
38149  case X86::PTCONJTCMMIMFP16PS:
38150  case X86::PTMMULTF32PS:
38151  case X86::PTTMMULTF32PS: {
38152    unsigned Opc;
38153    switch (MI.getOpcode()) {
38154    default: llvm_unreachable("illegal opcode!");
38155    case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38156    case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38157    case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38158    case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38159    case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38160    case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38161    case X86::PTCMMIMFP16PS:
38162      Opc = X86::TCMMIMFP16PS;
38163      break;
38164    case X86::PTCMMRLFP16PS:
38165      Opc = X86::TCMMRLFP16PS;
38166      break;
38167    case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38168    case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38169    case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38170    case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38171    case X86::PTTDPBF16PS:
38172      Opc = X86::TTDPBF16PS;
38173      break;
38174    case X86::PTTDPFP16PS:
38175      Opc = X86::TTDPFP16PS;
38176      break;
38177    case X86::PTTCMMIMFP16PS:
38178      Opc = X86::TTCMMIMFP16PS;
38179      break;
38180    case X86::PTTCMMRLFP16PS:
38181      Opc = X86::TTCMMRLFP16PS;
38182      break;
38183    case X86::PTCONJTCMMIMFP16PS:
38184      Opc = X86::TCONJTCMMIMFP16PS;
38185      break;
38186    case X86::PTMMULTF32PS:
38187      Opc = X86::TMMULTF32PS;
38188      break;
38189    case X86::PTTMMULTF32PS:
38190      Opc = X86::TTMMULTF32PS;
38191      break;
38192    }
38193
    // Operand 0 is both accumulator input (Undef here, tied) and output.
38194    MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38195    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38196    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38197    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38198    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38199
38200    MI.eraseFromParent(); // The pseudo is gone now.
38201    return BB;
38202  }
38203  case X86::PTILEZERO: {
38204    unsigned Imm = MI.getOperand(0).getImm();
38205    BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38206    MI.eraseFromParent(); // The pseudo is gone now.
38207    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38209    return BB;
38210  }
38211  case X86::PTILEZEROV: {
38212    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38214    return BB;
38215  }
38216  case X86::PTILELOADDRS:
38217  case X86::PTILELOADDRST1:
38218  case X86::PTILELOADD:
38219  case X86::PTILELOADDT1:
38220  case X86::PTILESTORED: {
38221    unsigned Opc;
38222    switch (MI.getOpcode()) {
38223    default: llvm_unreachable("illegal opcode!");
38224#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38225    case X86::PTILELOADD:
38226      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38227      break;
38228    case X86::PTILELOADDT1:
38229      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38230      break;
38231    case X86::PTILESTORED:
38232      Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38233      break;
38234    case X86::PTILELOADDRS:
38235      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38236      break;
38237    case X86::PTILELOADDRST1:
38238      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38239      break;
38240    }
38241#undef GET_EGPR_IF_ENABLED
38242
    // Loads put the tile register first; stores put it after the address.
38243    MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38244    unsigned CurOp = 0;
38245    if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38246      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38248
38249    MIB.add(MI.getOperand(CurOp++)); // base
38250    MIB.add(MI.getOperand(CurOp++)); // scale
38251    MIB.add(MI.getOperand(CurOp++)); // index -- stride
38252    MIB.add(MI.getOperand(CurOp++)); // displacement
38253    MIB.add(MI.getOperand(CurOp++)); // segment
38254
38255    if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38256      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38258
38259    MI.eraseFromParent(); // The pseudo is gone now.
38260    return BB;
38261  }
  // AMX paired-tile load pseudos: destination is a TMM register *pair*.
38262  case X86::PT2RPNTLVWZ0:
38263  case X86::PT2RPNTLVWZ0T1:
38264  case X86::PT2RPNTLVWZ1:
38265  case X86::PT2RPNTLVWZ1T1:
38266  case X86::PT2RPNTLVWZ0RS:
38267  case X86::PT2RPNTLVWZ0RST1:
38268  case X86::PT2RPNTLVWZ1RS:
38269  case X86::PT2RPNTLVWZ1RST1: {
38270    const DebugLoc &DL = MI.getDebugLoc();
38271    unsigned Opc;
38272#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38273    switch (MI.getOpcode()) {
38274    default:
38275      llvm_unreachable("Unexpected instruction!");
38276    case X86::PT2RPNTLVWZ0:
38277      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38278      break;
38279    case X86::PT2RPNTLVWZ0T1:
38280      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38281      break;
38282    case X86::PT2RPNTLVWZ1:
38283      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38284      break;
38285    case X86::PT2RPNTLVWZ1T1:
38286      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38287      break;
38288    case X86::PT2RPNTLVWZ0RS:
38289      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38290      break;
38291    case X86::PT2RPNTLVWZ0RST1:
38292      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38293      break;
38294    case X86::PT2RPNTLVWZ1RS:
38295      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38296      break;
38297    case X86::PT2RPNTLVWZ1RST1:
38298      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38299      break;
38300    }
38301#undef GET_EGPR_IF_ENABLED
38302    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38303    MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38304
38305    MIB.add(MI.getOperand(1)); // base
38306    MIB.add(MI.getOperand(2)); // scale
38307    MIB.add(MI.getOperand(3)); // index
38308    MIB.add(MI.getOperand(4)); // displacement
38309    MIB.add(MI.getOperand(5)); // segment
38310    MI.eraseFromParent(); // The pseudo is gone now.
38311    return BB;
38312  }
38313  case X86::PTTRANSPOSED:
38314  case X86::PTCONJTFP16: {
38315    const DebugLoc &DL = MI.getDebugLoc();
38316    unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38317                                                       : X86::TCONJTFP16;
38318
38319    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38320    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38321    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38322
38323    MI.eraseFromParent(); // The pseudo is gone now.
38324    return BB;
38325  }
  // Tile-row extract pseudos, register-immediate ("rri") forms.
38326  case X86::PTCVTROWPS2BF16Hrri:
38327  case X86::PTCVTROWPS2BF16Lrri:
38328  case X86::PTCVTROWPS2PHHrri:
38329  case X86::PTCVTROWPS2PHLrri:
38330  case X86::PTCVTROWD2PSrri:
38331  case X86::PTILEMOVROWrri: {
38332    const DebugLoc &DL = MI.getDebugLoc();
38333    unsigned Opc;
38334    switch (MI.getOpcode()) {
38335    default:
38336      llvm_unreachable("Unexpected instruction!");
38337    case X86::PTCVTROWD2PSrri:
38338      Opc = X86::TCVTROWD2PSrri;
38339      break;
38340    case X86::PTCVTROWPS2BF16Hrri:
38341      Opc = X86::TCVTROWPS2BF16Hrri;
38342      break;
38343    case X86::PTCVTROWPS2PHHrri:
38344      Opc = X86::TCVTROWPS2PHHrri;
38345      break;
38346    case X86::PTCVTROWPS2BF16Lrri:
38347      Opc = X86::TCVTROWPS2BF16Lrri;
38348      break;
38349    case X86::PTCVTROWPS2PHLrri:
38350      Opc = X86::TCVTROWPS2PHLrri;
38351      break;
38352    case X86::PTILEMOVROWrri:
38353      Opc = X86::TILEMOVROWrri;
38354      break;
38355    }
38356    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38357    MIB.add(MI.getOperand(0));
38358    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38359    MIB.addImm(MI.getOperand(2).getImm());
38360
38361    MI.eraseFromParent(); // The pseudo is gone now.
38362    return BB;
38363  }
  // Same family, register-register ("rre") forms: the row index is a
  // register operand rather than an immediate.
38364  case X86::PTCVTROWPS2BF16Hrre:
38365  case X86::PTCVTROWPS2BF16Lrre:
38366  case X86::PTCVTROWPS2PHHrre:
38367  case X86::PTCVTROWPS2PHLrre:
38368  case X86::PTCVTROWD2PSrre:
38369  case X86::PTILEMOVROWrre: {
38370    const DebugLoc &DL = MI.getDebugLoc();
38371    unsigned Opc;
38372    switch (MI.getOpcode()) {
38373    default:
38374      llvm_unreachable("Unexpected instruction!");
38375    case X86::PTCVTROWD2PSrre:
38376      Opc = X86::TCVTROWD2PSrre;
38377      break;
38378    case X86::PTCVTROWPS2BF16Hrre:
38379      Opc = X86::TCVTROWPS2BF16Hrre;
38380      break;
38381    case X86::PTCVTROWPS2BF16Lrre:
38382      Opc = X86::TCVTROWPS2BF16Lrre;
38383      break;
38384    case X86::PTCVTROWPS2PHHrre:
38385      Opc = X86::TCVTROWPS2PHHrre;
38386      break;
38387    case X86::PTCVTROWPS2PHLrre:
38388      Opc = X86::TCVTROWPS2PHLrre;
38389      break;
38390    case X86::PTILEMOVROWrre:
38391      Opc = X86::TILEMOVROWrre;
38392      break;
38393    }
38394    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38395    MIB.add(MI.getOperand(0));
38396    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38397    MIB.add(MI.getOperand(2));
38398
38399    MI.eraseFromParent(); // The pseudo is gone now.
38400    return BB;
38401  }
38402  }
38403}
38404
38405//===----------------------------------------------------------------------===//
38406// X86 Optimization Hooks
38407//===----------------------------------------------------------------------===//
38408
38409bool
// NOTE(review): the export dropped file line 38410, which carried the
// function name ("X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,"
// upstream); the parameter list below is its continuation.
// Tries to shrink (or, for vectors, sign-extend) the constant RHS of a
// logic op so it is cheaper to materialise. Returns true to keep the
// current constant as-is, false when nothing was changed; replacements are
// installed via TLO.CombineTo.
38411                                                const APInt &DemandedBits,
38412                                                const APInt &DemandedElts,
38413                                                TargetLoweringOpt &TLO) const {
38414  EVT VT = Op.getValueType();
38415  unsigned Opcode = Op.getOpcode();
38416  unsigned EltSize = VT.getScalarSizeInBits();
38417
38418  if (VT.isVector()) {
38419    // If the constant is only all signbits in the active bits, then we should
38420    // extend it to the entire constant to allow it act as a boolean constant
38421    // vector.
38422    auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38423      if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38424        return false;
38425      for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38426        if (!DemandedElts[i] || V.getOperand(i).isUndef())
38427          continue;
38428        const APInt &Val = V.getConstantOperandAPInt(i);
38429        if (Val.getBitWidth() > Val.getNumSignBits() &&
38430            Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38431          return true;
38432      }
38433      return false;
38434    };
38435    // For vectors - if we have a constant, then try to sign extend.
38436    // TODO: Handle AND cases.
38437    unsigned ActiveBits = DemandedBits.getActiveBits();
38438    if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38439        (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38440        NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38441      EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38442      EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
      // NOTE(review): file lines 38443 and 38445 were dropped by the export
      // — 38443 is the element-count argument closing the getVectorVT call,
      // and 38445 is the getNode call (upstream an ISD::SIGN_EXTEND_INREG)
      // whose operands continue on line 38446 below.
38444      SDValue NewC =
38446                          Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38447      SDValue NewOp =
38448          TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38449      return TLO.CombineTo(Op, NewOp);
38450    }
38451    return false;
38452  }
38453
38454  // Only optimize Ands to prevent shrinking a constant that could be
38455  // matched by movzx.
38456  if (Opcode != ISD::AND)
38457    return false;
38458
38459  // Make sure the RHS really is a constant.
38460  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38461  if (!C)
38462    return false;
38463
38464  const APInt &Mask = C->getAPIntValue();
38465
38466  // Clear all non-demanded bits initially.
38467  APInt ShrunkMask = Mask & DemandedBits;
38468
38469  // Find the width of the shrunk mask.
38470  unsigned Width = ShrunkMask.getActiveBits();
38471
38472  // If the mask is all 0s there's nothing to do here.
38473  if (Width == 0)
38474    return false;
38475
38476  // Find the next power of 2 width, rounding up to a byte.
38477  Width = llvm::bit_ceil(std::max(Width, 8U));
38478  // Truncate the width to size to handle illegal types.
38479  Width = std::min(Width, EltSize);
38480
38481  // Calculate a possible zero extend mask for this constant.
38482  APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38483
38484  // If we aren't changing the mask, just return true to keep it and prevent
38485  // the caller from optimizing.
38486  if (ZeroExtendMask == Mask)
38487    return true;
38488
38489  // Make sure the new mask can be represented by a combination of mask bits
38490  // and non-demanded bits.
38491  if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38492    return false;
38493
38494  // Replace the constant with the zero extend mask.
38495  SDLoc DL(Op);
38496  SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38497  SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38498  return TLO.CombineTo(Op, NewOp);
38499}
38500
// Computes known bits for a PSADBW node (called for X86ISD::PSADBW and the
// psad.bw intrinsics below): unsigned absolute byte differences, horizontally
// summed into each i64 result lane.
// NOTE(review): the export dropped file line 38501, which carried the
// signature ("static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,"
// upstream); the parameters below continue it.
38502                                      KnownBits &Known,
38503                                      const APInt &DemandedElts,
38504                                      const SelectionDAG &DAG, unsigned Depth) {
38505  KnownBits Known2;
38506  unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
  // Map the demanded i64 result lanes onto the i8 source lanes they cover.
38507  APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38508  Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38509  Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
  // |lhs - rhs| per byte, widened to 16 bits so the sums below cannot wrap.
38510  Known = KnownBits::abdu(Known, Known2).zext(16);
38511  // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
  // Three self-additions model the 8-way horizontal reduction tree.
38512  Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38513  Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38514  Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38515  Known = Known.zext(64);
38516}
38517
// Computes known bits for a PMADDWD node (called for X86ISD::VPMADDWD and the
// pmadd.wd intrinsics): each i32 result lane is LoPair(lhs*rhs) + HiPair.
// NOTE(review): the export dropped file line 38518, which carried the
// signature ("static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,"
// upstream); the parameters below continue it.
38519                                       KnownBits &Known,
38520                                       const APInt &DemandedElts,
38521                                       const SelectionDAG &DAG,
38522                                       unsigned Depth) {
38523  unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38524
38525  // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38526  APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
  // Even (lo) and odd (hi) i16 source lanes contribute to each i32 lane.
38527  APInt DemandedLoElts =
38528      DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38529  APInt DemandedHiElts =
38530      DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38531  KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38532  KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38533  KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38534  KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
  // Signed i16 x i16 -> i32 products; the final add may wrap, so no NSW/NUW.
38535  KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38536  KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38537  Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38538}
38539
// Computes known bits for a PMADDUBSW node (called for X86ISD::VPMADDUBSW and
// the pmadd.ub.sw intrinsics): unsigned-i8 x signed-i8 products, pairwise
// saturating-added into each i16 result lane.
// NOTE(review): the export dropped file line 38540, which carried the
// signature ("static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue
// RHS," upstream); the parameters below continue it.
38541                                         KnownBits &Known,
38542                                         const APInt &DemandedElts,
38543                                         const SelectionDAG &DAG,
38544                                         unsigned Depth) {
38545  unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38546
38547  // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38548  // pairs.
38549  APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
  // Even (lo) and odd (hi) i8 source lanes contribute to each i16 lane.
38550  APInt DemandedLoElts =
38551      DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38552  APInt DemandedHiElts =
38553      DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38554  KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38555  KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38556  KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38557  KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
  // LHS is zero-extended (unsigned i8), RHS sign-extended (signed i8).
38558  KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38559  KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38560  Known = KnownBits::sadd_sat(Lo, Hi);
38561}
38562
// Computes known bits for a horizontal binop (e.g. HADD/HSUB below) by
// splitting the demanded result lanes between the two source operands and
// combining each operand's even/odd lane pairs via the supplied
// KnownBitsFunc.
// NOTE(review): the export dropped file line 38563, which carried the
// signature (upstream "static KnownBits computeKnownBitsForHorizontalOperation(");
// the parameters below continue it.
38564    const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38565    const SelectionDAG &DAG,
38566    const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38567        KnownBitsFunc) {
38568  APInt DemandedEltsLHS, DemandedEltsRHS;
38569  getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38570                                      DemandedElts, DemandedEltsLHS,
38571                                      DemandedEltsRHS);
38572
  // Combine each demanded even lane with its odd neighbour (mask << 1).
38573  const auto ComputeForSingleOpFunc =
38574      [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38575        return KnownBitsFunc(
38576            DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38577            DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38578      };
38579
  // If only one source operand is demanded, avoid analysing the other.
38580  if (DemandedEltsRHS.isZero())
38581    return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38582  if (DemandedEltsLHS.isZero())
38583    return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38584
38585  return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38586      .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38587}
38588
// X86 hook for SelectionDAG known-bits analysis of target-specific nodes.
// NOTE(review): the export dropped every hyperlinked line, including file
// line 38589 which carried the function name (upstream
// "void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,");
// further gaps are flagged inline below. Fills 'Known' for the demanded
// elements of Op; falls through to generic target-shuffle handling at the end.
38590                                                      KnownBits &Known,
38591                                                      const APInt &DemandedElts,
38592                                                      const SelectionDAG &DAG,
38593                                                      unsigned Depth) const {
38594  unsigned BitWidth = Known.getBitWidth();
38595  unsigned NumElts = DemandedElts.getBitWidth();
38596  unsigned Opc = Op.getOpcode();
38597  EVT VT = Op.getValueType();
  // NOTE(review): file lines 38598-38601 (the opening of an assert checking
  // Opc is a target/intrinsic opcode) were dropped by the export; the two
  // string lines below are the assert's message continuation.
38602         "Should use MaskedValueIsZero if you don't know whether Op"
38603         " is a target node!");
38604
38605  Known.resetAll();
38606  switch (Opc) {
38607  default: break;
38608  case X86ISD::MUL_IMM: {
38609    KnownBits Known2;
38610    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38611    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38612    Known = KnownBits::mul(Known, Known2);
38613    break;
38614  }
38615  case X86ISD::BSF: {
  // NOTE(review): file line 38616 (an initial Known.Zero upper-bound for the
  // BSF result width) was dropped by the export.
38617
38618    KnownBits Known2;
38619    Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38620    if (Known2.isNonZero()) {
38621      // If we have a known 1, its position is our upper bound.
38622      unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38623      unsigned LowBits = llvm::bit_width(PossibleTZ);
38624      Known.Zero.setBitsFrom(LowBits);
38625    } else if (!Op.getOperand(0).isUndef()) {
      // Source may be zero: the result is then the pass-through operand 0.
38626      Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38627      Known = Known.intersectWith(Known2);
38628    }
38629    break;
38630  }
38631  case X86ISD::BSR: {
38632    // TODO: Bound with input known bits?
  // NOTE(review): file line 38633 (an initial Known.Zero upper-bound for the
  // BSR result width) was dropped by the export.
38634
38635    if (!Op.getOperand(0).isUndef() &&
38636        !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38637      KnownBits Known2;
38638      Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38639      Known = Known.intersectWith(Known2);
38640    }
38641    break;
38642  }
38643  case X86ISD::SETCC:
    // SETCC produces 0 or 1 in the low bit.
38644    Known.Zero.setBitsFrom(1);
38645    break;
38646  case X86ISD::MOVMSK: {
38647    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38648    Known.Zero.setBitsFrom(NumLoBits);
38649    break;
38650  }
38651  case X86ISD::PEXTRB:
38652  case X86ISD::PEXTRW: {
38653    SDValue Src = Op.getOperand(0);
38654    EVT SrcVT = Src.getValueType();
38655    APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38656                                            Op.getConstantOperandVal(1));
38657    Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38658    Known = Known.anyextOrTrunc(BitWidth);
38659    Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38660    break;
38661  }
38662  case X86ISD::VSRAI:
38663  case X86ISD::VSHLI:
38664  case X86ISD::VSRLI: {
38665    unsigned ShAmt = Op.getConstantOperandVal(1);
38666    if (ShAmt >= VT.getScalarSizeInBits()) {
38667      // Out of range logical bit shifts are guaranteed to be zero.
38668      // Out of range arithmetic bit shifts splat the sign bit.
38669      if (Opc != X86ISD::VSRAI) {
38670        Known.setAllZero();
38671        break;
38672      }
38673
38674      ShAmt = VT.getScalarSizeInBits() - 1;
38675    }
38676
38677    Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38678    if (Opc == X86ISD::VSHLI) {
38679      Known <<= ShAmt;
38680      // Low bits are known zero.
38681      Known.Zero.setLowBits(ShAmt);
38682    } else if (Opc == X86ISD::VSRLI) {
38683      Known >>= ShAmt;
38684      // High bits are known zero.
38685      Known.Zero.setHighBits(ShAmt);
38686    } else {
38687      Known.Zero.ashrInPlace(ShAmt);
38688      Known.One.ashrInPlace(ShAmt);
38689    }
38690    break;
38691  }
38692  case X86ISD::PACKUS: {
38693    // PACKUS is just a truncation if the upper half is zero.
38694    APInt DemandedLHS, DemandedRHS;
38695    getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38696
    // Start from "all bits known" at double width and intersect below.
38697    Known.One = APInt::getAllOnes(BitWidth * 2);
38698    Known.Zero = APInt::getAllOnes(BitWidth * 2);
38699
38700    KnownBits Known2;
38701    if (!!DemandedLHS) {
38702      Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38703      Known = Known.intersectWith(Known2);
38704    }
38705    if (!!DemandedRHS) {
38706      Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38707      Known = Known.intersectWith(Known2);
38708    }
38709
    // If the upper half is not known zero, saturation may kick in: give up.
38710    if (Known.countMinLeadingZeros() < BitWidth)
38711      Known.resetAll();
38712    Known = Known.trunc(BitWidth);
38713    break;
38714  }
38715  case X86ISD::PSHUFB: {
38716    SDValue Src = Op.getOperand(0);
38717    SDValue Idx = Op.getOperand(1);
38718
38719    // If the index vector is never negative (MSB is zero), then all elements
38720    // come from the source vector. This is useful for cases where
38721    // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38722    // below will handle the more common constant shuffle mask case.
38723    KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38724    if (KnownIdx.isNonNegative())
38725      Known = DAG.computeKnownBits(Src, Depth + 1);
38726    break;
38727  }
38728  case X86ISD::VBROADCAST: {
38729    SDValue Src = Op.getOperand(0);
38730    if (!Src.getSimpleValueType().isVector()) {
38731      Known = DAG.computeKnownBits(Src, Depth + 1);
38732      return;
38733    }
38734    break;
38735  }
38736  case X86ISD::AND: {
    // Result 0 is the logical value; other results (flags) are not modelled.
38737    if (Op.getResNo() == 0) {
38738      KnownBits Known2;
38739      Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38740      Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38741      Known &= Known2;
38742    }
38743    break;
38744  }
38745  case X86ISD::ANDNP: {
38746    KnownBits Known2;
38747    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38748    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38749
38750    // ANDNP = (~X & Y);
38751    Known.One &= Known2.Zero;
38752    Known.Zero |= Known2.One;
38753    break;
38754  }
38755  case X86ISD::FOR: {
38756    KnownBits Known2;
38757    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38758    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38759
38760    Known |= Known2;
38761    break;
38762  }
38763  case X86ISD::PSADBW: {
38764    SDValue LHS = Op.getOperand(0);
38765    SDValue RHS = Op.getOperand(1);
38766    assert(VT.getScalarType() == MVT::i64 &&
38767           LHS.getValueType() == RHS.getValueType() &&
38768           LHS.getValueType().getScalarType() == MVT::i8 &&
38769           "Unexpected PSADBW types");
38770    computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38771    break;
38772  }
38773  case X86ISD::PCMPGT:
38774  case X86ISD::PCMPEQ: {
38775    KnownBits KnownLhs =
38776        DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38777    KnownBits KnownRhs =
38778        DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38779    std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38780                                  ? KnownBits::eq(KnownLhs, KnownRhs)
38781                                  : KnownBits::sgt(KnownLhs, KnownRhs);
    // Vector compares produce all-ones / all-zeros per lane.
38782    if (Res) {
38783      if (*Res)
38784        Known.setAllOnes();
38785      else
38786        Known.setAllZero();
38787    }
38788    break;
38789  }
38790  case X86ISD::VPMADDWD: {
38791    SDValue LHS = Op.getOperand(0);
38792    SDValue RHS = Op.getOperand(1);
38793    assert(VT.getVectorElementType() == MVT::i32 &&
38794           LHS.getValueType() == RHS.getValueType() &&
38795           LHS.getValueType().getVectorElementType() == MVT::i16 &&
38796           "Unexpected PMADDWD types");
38797    computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38798    break;
38799  }
38800  case X86ISD::VPMADDUBSW: {
38801    SDValue LHS = Op.getOperand(0);
38802    SDValue RHS = Op.getOperand(1);
38803    assert(VT.getVectorElementType() == MVT::i16 &&
38804           LHS.getValueType() == RHS.getValueType() &&
38805           LHS.getValueType().getVectorElementType() == MVT::i8 &&
38806           "Unexpected PMADDUBSW types");
38807    computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38808    break;
38809  }
38810  case X86ISD::PMULUDQ: {
38811    KnownBits Known2;
38812    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38813    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38814
    // PMULUDQ multiplies the low halves zero-extended to the full width.
38815    Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38816    Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38817    Known = KnownBits::mul(Known, Known2);
38818    break;
38819  }
38820  case X86ISD::CMOV: {
38821    Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38822    // If we don't know any bits, early out.
38823    if (Known.isUnknown())
38824      break;
38825    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38826
38827    // Only known if known in both the LHS and RHS.
38828    Known = Known.intersectWith(Known2);
38829    break;
38830  }
38831  case X86ISD::BEXTR:
38832  case X86ISD::BEXTRI: {
38833    SDValue Op0 = Op.getOperand(0);
38834    SDValue Op1 = Op.getOperand(1);
38835
38836    if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
      // Control operand encodes start bit (bits 0-7) and length (bits 8-15).
38837      unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38838      unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38839
38840      // If the length is 0, the result is 0.
38841      if (Length == 0) {
38842        Known.setAllZero();
38843        break;
38844      }
38845
38846      if ((Shift + Length) <= BitWidth) {
38847        Known = DAG.computeKnownBits(Op0, Depth + 1);
38848        Known = Known.extractBits(Length, Shift);
38849        Known = Known.zextOrTrunc(BitWidth);
38850      }
38851    }
38852    break;
38853  }
38854  case X86ISD::PDEP: {
38855    KnownBits Known2;
38856    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38857    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38858    // Zeros are retained from the mask operand. But not ones.
38859    Known.One.clearAllBits();
38860    // The result will have at least as many trailing zeros as the non-mask
38861    // operand since bits can only map to the same or higher bit position.
38862    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38863    break;
38864  }
38865  case X86ISD::PEXT: {
38866    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38867    // The result has as many leading zeros as the number of zeroes in the mask.
38868    unsigned Count = Known.Zero.popcount();
38869    Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38870    Known.One.clearAllBits();
38871    break;
38872  }
38873  case X86ISD::VTRUNC:
38874  case X86ISD::VTRUNCS:
38875  case X86ISD::VTRUNCUS:
38876  case X86ISD::CVTSI2P:
38877  case X86ISD::CVTUI2P:
38878  case X86ISD::CVTP2SI:
38879  case X86ISD::CVTP2UI:
38880  case X86ISD::MCVTP2SI:
38881  case X86ISD::MCVTP2UI:
38882  case X86ISD::CVTTP2SI:
38883  case X86ISD::CVTTP2UI:
38884  case X86ISD::MCVTTP2SI:
38885  case X86ISD::MCVTTP2UI:
38886  case X86ISD::MCVTSI2P:
38887  case X86ISD::MCVTUI2P:
38888  case X86ISD::VFPROUND:
38889  case X86ISD::VMFPROUND:
38890  case X86ISD::CVTPS2PH:
38891  case X86ISD::MCVTPS2PH:
38892  case X86ISD::MCVTTP2SIS:
38893  case X86ISD::MCVTTP2UIS: {
38894    // Truncations/Conversions - upper elements are known zero.
38895    EVT SrcVT = Op.getOperand(0).getValueType();
38896    if (SrcVT.isVector()) {
38897      unsigned NumSrcElts = SrcVT.getVectorNumElements();
38898      if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38899        Known.setAllZero();
38900    }
38901    break;
38902  }
  // NOTE(review): file lines 38903-38908 (the STRICT_* conversion case
  // labels opening this block) were dropped by the export; the body below
  // reads the source from operand 1 because operand 0 is the strict chain.
38909    // Strict Conversions - upper elements are known zero.
38910    EVT SrcVT = Op.getOperand(1).getValueType();
38911    if (SrcVT.isVector()) {
38912      unsigned NumSrcElts = SrcVT.getVectorNumElements();
38913      if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38914        Known.setAllZero();
38915    }
38916    break;
38917  }
38918  case X86ISD::MOVQ2DQ: {
38919    // Move from MMX to XMM. Upper half of XMM should be 0.
38920    if (DemandedElts.countr_zero() >= (NumElts / 2))
38921      Known.setAllZero();
38922    break;
38923  }
  // NOTE(review): file line 38924 (the case label opening this
  // constant-bits block) was dropped by the export.
38925    APInt UndefElts;
38926    SmallVector<APInt, 16> EltBits;
38927    if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38928                                      /*AllowWholeUndefs*/ false,
38929                                      /*AllowPartialUndefs*/ false)) {
38930      Known.Zero.setAllBits();
38931      Known.One.setAllBits();
38932      for (unsigned I = 0; I != NumElts; ++I) {
38933        if (!DemandedElts[I])
38934          continue;
38935        if (UndefElts[I]) {
38936          Known.resetAll();
38937          break;
38938        }
38939        KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38940        Known = Known.intersectWith(Known2);
38941      }
38942      return;
38943    }
38944    break;
38945  }
38946  case X86ISD::HADD:
38947  case X86ISD::HSUB: {
  // NOTE(review): file lines 38948 and 38951 (the call to
  // computeKnownBitsForHorizontalOperation and the KnownBits helper call
  // inside the lambda) were dropped by the export; the lines below are
  // their argument continuations.
38949        Op, DemandedElts, Depth, DAG,
38950        [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38952              /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38953              KnownLHS, KnownRHS);
38954        });
38955    break;
38956  }
  // NOTE(review): file line 38957 (the intrinsic case label opening this
  // block, which switches on the intrinsic ID in operand 0) was dropped by
  // the export.
38958    switch (Op->getConstantOperandVal(0)) {
38959    case Intrinsic::x86_sse2_pmadd_wd:
38960    case Intrinsic::x86_avx2_pmadd_wd:
38961    case Intrinsic::x86_avx512_pmaddw_d_512: {
38962      SDValue LHS = Op.getOperand(1);
38963      SDValue RHS = Op.getOperand(2);
38964      assert(VT.getScalarType() == MVT::i32 &&
38965             LHS.getValueType() == RHS.getValueType() &&
38966             LHS.getValueType().getScalarType() == MVT::i16 &&
38967             "Unexpected PMADDWD types");
38968      computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38969      break;
38970    }
38971    case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38972    case Intrinsic::x86_avx2_pmadd_ub_sw:
38973    case Intrinsic::x86_avx512_pmaddubs_w_512: {
38974      SDValue LHS = Op.getOperand(1);
38975      SDValue RHS = Op.getOperand(2);
38976      assert(VT.getScalarType() == MVT::i16 &&
38977             LHS.getValueType() == RHS.getValueType() &&
38978             LHS.getValueType().getScalarType() == MVT::i8 &&
38979             "Unexpected PMADDUBSW types");
38980      computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38981      break;
38982    }
38983    case Intrinsic::x86_sse2_psad_bw:
38984    case Intrinsic::x86_avx2_psad_bw:
38985    case Intrinsic::x86_avx512_psad_bw_512: {
38986      SDValue LHS = Op.getOperand(1);
38987      SDValue RHS = Op.getOperand(2);
38988      assert(VT.getScalarType() == MVT::i64 &&
38989             LHS.getValueType() == RHS.getValueType() &&
38990             LHS.getValueType().getScalarType() == MVT::i8 &&
38991             "Unexpected PSADBW types");
38992      computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38993      break;
38994    }
38995    }
38996    break;
38997  }
38998  }
38999
39000  // Handle target shuffles.
39001  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39002  if (isTargetShuffle(Opc)) {
  // NOTE(review): file lines 39003-39004 (the local Ops/Mask SmallVector
  // declarations used by getTargetShuffleMask below) were dropped by the
  // export.
39005    if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39006      unsigned NumOps = Ops.size();
39007      unsigned NumElts = VT.getVectorNumElements();
39008      if (Mask.size() == NumElts) {
39009        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39010        Known.Zero.setAllBits(); Known.One.setAllBits();
39011        for (unsigned i = 0; i != NumElts; ++i) {
39012          if (!DemandedElts[i])
39013            continue;
39014          int M = Mask[i];
39015          if (M == SM_SentinelUndef) {
39016            // For UNDEF elements, we don't know anything about the common state
39017            // of the shuffle result.
39018            Known.resetAll();
39019            break;
39020          }
39021          if (M == SM_SentinelZero) {
39022            Known.One.clearAllBits();
39023            continue;
39024          }
39025          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39026                 "Shuffle index out of range");
39027
39028          unsigned OpIdx = (unsigned)M / NumElts;
39029          unsigned EltIdx = (unsigned)M % NumElts;
39030          if (Ops[OpIdx].getValueType() != VT) {
39031            // TODO - handle target shuffle ops with different value types.
39032            Known.resetAll();
39033            break;
39034          }
39035          DemandedOps[OpIdx].setBit(EltIdx);
39036        }
39037        // Known bits are the values that are shared by every demanded element.
39038        for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39039          if (!DemandedOps[i])
39040            continue;
39041          KnownBits Known2 =
39042              DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39043          Known = Known.intersectWith(Known2);
39044        }
39045      }
39046    }
39047  }
39048}
39049
// X86 hook for SelectionDAG sign-bit analysis of target-specific nodes.
// NOTE(review): the export dropped file line 39050, which carried the
// function name (upstream
// "unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode("); further
// dropped lines are flagged inline below. Returns the minimum number of
// redundant sign bits in each demanded element, 1 as the safe fallback.
39051    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39052    unsigned Depth) const {
39053  EVT VT = Op.getValueType();
39054  unsigned VTBits = VT.getScalarSizeInBits();
39055  unsigned Opcode = Op.getOpcode();
39056  switch (Opcode) {
  // NOTE(review): file line 39057 (the case label this comment and return
  // belong to — upstream X86ISD::SETCC_CARRY) was dropped by the export.
39058    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39059    return VTBits;
39060
39061  case X86ISD::VTRUNC: {
39062    SDValue Src = Op.getOperand(0);
39063    MVT SrcVT = Src.getSimpleValueType();
39064    unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39065    assert(VTBits < NumSrcBits && "Illegal truncation input type");
39066    APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39067    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
    // Sign bits surviving the truncation are those beyond the dropped width.
39068    if (Tmp > (NumSrcBits - VTBits))
39069      return Tmp - (NumSrcBits - VTBits);
39070    return 1;
39071  }
39072
39073  case X86ISD::PACKSS: {
39074    // PACKSS is just a truncation if the sign bits extend to the packed size.
39075    APInt DemandedLHS, DemandedRHS;
39076    getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39077                        DemandedRHS);
39078
39079    // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39080    // patterns often used to compact vXi64 allsignbit patterns.
39081    auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
  // NOTE(review): file line 39082 (the bitcast peek initialising BC) was
  // dropped by the export.
39083      if (BC.getOpcode() == X86ISD::PACKSS &&
39084          BC.getScalarValueSizeInBits() == 16 &&
39085          V.getScalarValueSizeInBits() == 32) {
  // NOTE(review): file lines 39086-39087 (the bitcast peeks initialising
  // BC0/BC1 from BC's operands) were dropped by the export.
39088        if (BC0.getScalarValueSizeInBits() == 64 &&
39089            BC1.getScalarValueSizeInBits() == 64 &&
39090            DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39091            DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39092          return 32;
39093      }
39094      return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39095    };
39096
39097    unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39098    unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39099    if (!!DemandedLHS)
39100      Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39101    if (!!DemandedRHS)
39102      Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39103    unsigned Tmp = std::min(Tmp0, Tmp1);
39104    if (Tmp > (SrcBits - VTBits))
39105      return Tmp - (SrcBits - VTBits);
39106    return 1;
39107  }
39108
39109  case X86ISD::VBROADCAST: {
39110    SDValue Src = Op.getOperand(0);
39111    if (!Src.getSimpleValueType().isVector())
39112      return DAG.ComputeNumSignBits(Src, Depth + 1);
39113    break;
39114  }
39115
39116  case X86ISD::VSHLI: {
39117    SDValue Src = Op.getOperand(0);
39118    const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39119    if (ShiftVal.uge(VTBits))
39120      return VTBits; // Shifted all bits out --> zero.
39121    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39122    if (ShiftVal.uge(Tmp))
39123      return 1; // Shifted all sign bits out --> unknown.
39124    return Tmp - ShiftVal.getZExtValue();
39125  }
39126
39127  case X86ISD::VSRAI: {
39128    SDValue Src = Op.getOperand(0);
39129    APInt ShiftVal = Op.getConstantOperandAPInt(1);
39130    if (ShiftVal.uge(VTBits - 1))
39131      return VTBits; // Sign splat.
39132    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    // Arithmetic right shift adds one sign bit per position shifted.
39133    ShiftVal += Tmp;
39134    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39135  }
39136
39137  case X86ISD::FSETCC:
39138    // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39139    if (VT == MVT::f32 || VT == MVT::f64 ||
39140        ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39141      return VTBits;
39142    break;
39143
39144  case X86ISD::PCMPGT:
39145  case X86ISD::PCMPEQ:
39146  case X86ISD::CMPP:
39147  case X86ISD::VPCOM:
39148  case X86ISD::VPCOMU:
39149    // Vector compares return zero/all-bits result values.
39150    return VTBits;
39151
39152  case X86ISD::ANDNP: {
39153    unsigned Tmp0 =
39154        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39155    if (Tmp0 == 1) return 1; // Early out.
39156    unsigned Tmp1 =
39157        DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39158    return std::min(Tmp0, Tmp1);
39159  }
39160
39161  case X86ISD::CMOV: {
39162    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39163    if (Tmp0 == 1) return 1; // Early out.
39164    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39165    return std::min(Tmp0, Tmp1);
39166  }
39167  }
39168
39169  // Handle target shuffles.
39170  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39171  if (isTargetShuffle(Opcode)) {
  // NOTE(review): file lines 39172-39173 (the local Ops/Mask SmallVector
  // declarations used by getTargetShuffleMask below) were dropped by the
  // export.
39174    if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39175      unsigned NumOps = Ops.size();
39176      unsigned NumElts = VT.getVectorNumElements();
39177      if (Mask.size() == NumElts) {
39178        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39179        for (unsigned i = 0; i != NumElts; ++i) {
39180          if (!DemandedElts[i])
39181            continue;
39182          int M = Mask[i];
39183          if (M == SM_SentinelUndef) {
39184            // For UNDEF elements, we don't know anything about the common state
39185            // of the shuffle result.
39186            return 1;
39187          } else if (M == SM_SentinelZero) {
39188            // Zero = all sign bits.
39189            continue;
39190          }
39191          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39192                 "Shuffle index out of range");
39193
39194          unsigned OpIdx = (unsigned)M / NumElts;
39195          unsigned EltIdx = (unsigned)M % NumElts;
39196          if (Ops[OpIdx].getValueType() != VT) {
39197            // TODO - handle target shuffle ops with different value types.
39198            return 1;
39199          }
39200          DemandedOps[OpIdx].setBit(EltIdx);
39201        }
39202        unsigned Tmp0 = VTBits;
39203        for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39204          if (!DemandedOps[i])
39205            continue;
39206          unsigned Tmp1 =
39207              DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39208          Tmp0 = std::min(Tmp0, Tmp1);
39209        }
39210        return Tmp0;
39211      }
39212    }
39213  }
39214
39215  // Fallback case.
39216  return 1;
39217}
39218
// Strips an X86ISD::Wrapper/WrapperRIP node to expose the wrapped address
// operand; returns the node unchanged when it is not a wrapper.
// NOTE(review): the export dropped file line 39219, which carried the
// signature (upstream "static SDValue unwrapAddress(SDValue N) {").
39220  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39221    return N->getOperand(0);
39222  return N;
39223}
39224
39225// Helper to look for a normal load that can be narrowed into a vzload with the
39226// specified VT and memory VT. Returns SDValue() on failure.
// NOTE(review): the export dropped file line 39227, which carried the
// signature (upstream "static SDValue narrowLoadToVZLoad(LoadSDNode *LN,
// MVT MemVT, MVT VT,"); the parameter below continues it. Per the comment
// preceding this function, it narrows a normal load into a vzload of MemVT
// producing VT, returning SDValue() on failure.
39228                                  SelectionDAG &DAG) {
39229  // Can't if the load is volatile or atomic.
39230  if (!LN->isSimple())
39231    return SDValue();
39232
39233  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39234  SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
  // Build a VZEXT_LOAD mem-intrinsic node that loads only MemVT (zero-filling
  // the rest of VT), reusing the original load's chain, pointer info,
  // alignment and memory-operand flags.
39235  return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39236                                 LN->getPointerInfo(), LN->getBaseAlign(),
39237                                 LN->getMemOperand()->getFlags());
39238}
39239
39240// Attempt to match a combined shuffle mask against supported unary shuffle
39241// instructions.
39242// TODO: Investigate sharing more of this with shuffle lowering.
// On success, returns true and sets the out-params: Shuffle is the
// X86ISD/ISD opcode to emit, and SrcVT/DstVT are the value types to use for
// that node's input and result. V1 is the (single) shuffle source; DAG is
// only used for known-bits/sign-bits queries and mask-equivalence checks.
39243 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39244 bool AllowFloatDomain, bool AllowIntDomain,
39245 SDValue V1, const SelectionDAG &DAG,
39246 const X86Subtarget &Subtarget, unsigned &Shuffle,
39247 MVT &SrcVT, MVT &DstVT) {
39248 unsigned NumMaskElts = Mask.size();
39249 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39250
39251 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39252 if (Mask[0] == 0 &&
39253 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39254 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39256 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39257 Shuffle = X86ISD::VZEXT_MOVL;
// The f16 form requires FP16 registers; without SSE2 we must use the
// v4f32 (MOVSS-style) type instead of the integer mask type.
39258 if (MaskEltSize == 16)
39259 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39260 else
39261 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39262 return true;
39263 }
39264 }
39265
39266 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39267 if (AllowIntDomain &&
39268 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39269 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39270 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
// Try each power-of-2 extension scale, up to widening as far as 64-bit
// destination elements.
39271 unsigned MaxScale = 64 / MaskEltSize;
// Sign-extension only makes sense if V1's elements are exactly mask-sized
// and are already known to be all-signbits.
39272 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39273 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39274 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39275 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39276 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39277 continue;
39278 bool MatchAny = true;
39279 bool MatchZero = true;
39280 bool MatchSign = UseSign;
39281 unsigned NumDstElts = NumMaskElts / Scale;
// Each dst element i must take src element i; the remaining (Scale - 1)
// mask slots after it decide whether this is an any/zero/sign extension.
39282 for (unsigned i = 0;
39283 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39284 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39285 MatchAny = MatchSign = MatchZero = false;
39286 break;
39287 }
39288 unsigned Pos = (i * Scale) + 1;
39289 unsigned Len = Scale - 1;
39290 MatchAny &= isUndefInRange(Mask, Pos, Len);
39291 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39292 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39293 }
39294 if (MatchAny || MatchSign || MatchZero) {
39295 assert((MatchSign || MatchZero) &&
39296 "Failed to match sext/zext but matched aext?");
// The source must be at least 128 bits wide (legal vector size).
39297 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39298 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39299 : MVT::getIntegerVT(MaskEltSize);
39300 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39301
39302 Shuffle = unsigned(
39303 MatchAny ? ISD::ANY_EXTEND
39304 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
// If src has more elements than dst, use the *_EXTEND_VECTOR_INREG
// form which only extends the low elements.
39305 if (SrcVT.getVectorNumElements() != NumDstElts)
39306 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39307
39308 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39309 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39310 return true;
39311 }
39312 }
39313 }
39314
39315 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39316 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39317 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39318 isUndefOrEqual(Mask[0], 0) &&
39319 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39320 Shuffle = X86ISD::VZEXT_MOVL;
39321 if (MaskEltSize == 16)
39322 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39323 else
39324 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39325 return true;
39326 }
39327
39328 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39329 // instructions are no slower than UNPCKLPD but has the option to
39330 // fold the input operand into even an unaligned memory load.
39331 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39332 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39333 Shuffle = X86ISD::MOVDDUP;
39334 SrcVT = DstVT = MVT::v2f64;
39335 return true;
39336 }
39337 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39338 Shuffle = X86ISD::MOVSLDUP;
39339 SrcVT = DstVT = MVT::v4f32;
39340 return true;
39341 }
39342 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39343 Shuffle = X86ISD::MOVSHDUP;
39344 SrcVT = DstVT = MVT::v4f32;
39345 return true;
39346 }
39347 }
39348
// 256-bit duplicate patterns (AVX MOVDDUP/MOVSLDUP/MOVSHDUP).
39349 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39350 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39351 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39352 Shuffle = X86ISD::MOVDDUP;
39353 SrcVT = DstVT = MVT::v4f64;
39354 return true;
39355 }
39356 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39357 V1)) {
39358 Shuffle = X86ISD::MOVSLDUP;
39359 SrcVT = DstVT = MVT::v8f32;
39360 return true;
39361 }
39362 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39363 V1)) {
39364 Shuffle = X86ISD::MOVSHDUP;
39365 SrcVT = DstVT = MVT::v8f32;
39366 return true;
39367 }
39368 }
39369
// 512-bit duplicate patterns (AVX512 MOVDDUP/MOVSLDUP/MOVSHDUP).
39370 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39371 assert(Subtarget.hasAVX512() &&
39372 "AVX512 required for 512-bit vector shuffles");
39373 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39374 V1)) {
39375 Shuffle = X86ISD::MOVDDUP;
39376 SrcVT = DstVT = MVT::v8f64;
39377 return true;
39378 }
39380 MaskVT, Mask,
39381 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39382 Shuffle = X86ISD::MOVSLDUP;
39383 SrcVT = DstVT = MVT::v16f32;
39384 return true;
39385 }
39387 MaskVT, Mask,
39388 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39389 Shuffle = X86ISD::MOVSHDUP;
39390 SrcVT = DstVT = MVT::v16f32;
39391 return true;
39392 }
39393 }
39394
// No supported unary shuffle instruction matched.
39395 return false;
39396}
39397
39398// Attempt to match a combined shuffle mask against supported unary immediate
39399// permute instructions.
39400// TODO: Investigate sharing more of this with shuffle lowering.
// On success, sets Shuffle (X86ISD opcode), ShuffleVT (type to perform the
// permute in) and PermuteImm (the instruction's immediate operand).
// Zeroable marks mask elements known to be zero.
39402 const APInt &Zeroable,
39403 bool AllowFloatDomain, bool AllowIntDomain,
39404 const SelectionDAG &DAG,
39405 const X86Subtarget &Subtarget,
39406 unsigned &Shuffle, MVT &ShuffleVT,
39407 unsigned &PermuteImm) {
39408 unsigned NumMaskElts = Mask.size();
39409 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39410 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39411 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39412 bool ContainsZeros = isAnyZero(Mask);
39413
39414 // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
39415 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39416 // Check for lane crossing permutes.
39417 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39418 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39419 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39420 Shuffle = X86ISD::VPERMI;
39421 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39422 PermuteImm = getV4X86ShuffleImm(Mask);
39423 return true;
39424 }
// 512-bit VPERMI needs the 64-bit mask to repeat per 256-bit lane.
39425 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39426 SmallVector<int, 4> RepeatedMask;
39427 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39428 Shuffle = X86ISD::VPERMI;
39429 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39430 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39431 return true;
39432 }
39433 }
39434 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39435 // VPERMILPD can permute with a non-repeating shuffle.
39436 Shuffle = X86ISD::VPERMILPI;
39437 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39438 PermuteImm = 0;
// VPERMILPD's immediate has one selector bit per element, choosing the
// low/high f64 within that element's own 128-bit lane.
39439 for (int i = 0, e = Mask.size(); i != e; ++i) {
39440 int M = Mask[i];
39441 if (M == SM_SentinelUndef)
39442 continue;
39443 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39444 PermuteImm |= (M & 1) << i;
39445 }
39446 return true;
39447 }
39448 }
39449
39450 // We are checking for shuffle match or shift match. Loop twice so we can
39451 // order which we try and match first depending on target preference.
39452 for (unsigned Order = 0; Order < 2; ++Order) {
39453 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39454 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39455 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39456 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39457 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39458 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39459 SmallVector<int, 4> RepeatedMask;
39460 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39461 // Narrow the repeated mask to create 32-bit element permutes.
39462 SmallVector<int, 4> WordMask = RepeatedMask;
39463 if (MaskScalarSizeInBits == 64)
39464 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39465
39466 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39467 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39468 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39469 PermuteImm = getV4X86ShuffleImm(WordMask);
39470 return true;
39471 }
39472 }
39473
39474 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39475 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39476 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39477 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39478 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39479 SmallVector<int, 4> RepeatedMask;
39480 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
// Split the per-lane 8 x i16 mask into its lower/upper halves.
39481 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39482 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39483
39484 // PSHUFLW: permute lower 4 elements only.
39485 if (isUndefOrInRange(LoMask, 0, 4) &&
39486 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39487 Shuffle = X86ISD::PSHUFLW;
39488 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39489 PermuteImm = getV4X86ShuffleImm(LoMask);
39490 return true;
39491 }
39492
39493 // PSHUFHW: permute upper 4 elements only.
39494 if (isUndefOrInRange(HiMask, 4, 8) &&
39495 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39496 // Offset the HiMask so that we can create the shuffle immediate.
39497 int OffsetHiMask[4];
39498 for (int i = 0; i != 4; ++i)
39499 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39500
39501 Shuffle = X86ISD::PSHUFHW;
39502 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39503 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39504 return true;
39505 }
39506 }
39507 }
39508 } else {
39509 // Attempt to match against bit rotates.
// NOTE(review): ShuffleVT appears to be filled in by
// matchShuffleAsBitRotate on success - confirm against its definition.
39510 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39511 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39512 Subtarget.hasAVX512())) {
39513 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39514 Subtarget, Mask);
39515 if (0 < RotateAmt) {
39516 Shuffle = X86ISD::VROTLI;
39517 PermuteImm = (unsigned)RotateAmt;
39518 return true;
39519 }
39520 }
39521 }
39522 // Attempt to match against byte/bit shifts.
39523 if (AllowIntDomain &&
39524 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39525 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39526 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39527 int ShiftAmt =
39528 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39529 Zeroable, Subtarget);
39530 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39531 32 <= ShuffleVT.getScalarSizeInBits())) {
39532 // Byte shifts can be slower so only match them on second attempt.
39533 if (Order == 0 &&
39534 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39535 continue;
39536
39537 PermuteImm = (unsigned)ShiftAmt;
39538 return true;
39539 }
39540
39541 }
39542 }
39543
// No supported immediate permute matched.
39544 return false;
39545}
39546
39547// Attempt to match a combined unary shuffle mask against supported binary
39548// shuffle instructions.
39549// TODO: Investigate sharing more of this with shuffle lowering.
// On success, sets Shuffle (opcode) and SrcVT/DstVT. Note that V1/V2 are
// taken by reference and may be modified (swapped, duplicated or replaced
// with undef/zero) to fit the matched instruction's operand order.
39550static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39551 bool AllowFloatDomain, bool AllowIntDomain,
39552 SDValue &V1, SDValue &V2, const SDLoc &DL,
39553 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39554 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39555 bool IsUnary) {
39556 unsigned NumMaskElts = Mask.size();
39557 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39558 unsigned SizeInBits = MaskVT.getSizeInBits();
39559
39560 if (MaskVT.is128BitVector()) {
// {0,0}: duplicate the low f64 - UNPCKL (SSE2) or MOVLHPS (SSE1).
39561 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39562 AllowFloatDomain) {
39563 V2 = V1;
39564 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39565 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39566 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39567 return true;
39568 }
// {1,1}: duplicate the high f64 - UNPCKH (SSE2) or MOVHLPS (SSE1).
39569 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39570 AllowFloatDomain) {
39571 V2 = V1;
39572 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39573 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39574 return true;
39575 }
// {0,3}: MOVSD takes its low element from the *second* operand, so the
// operands must be swapped to match.
39576 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39577 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39578 std::swap(V1, V2);
39579 Shuffle = X86ISD::MOVSD;
39580 SrcVT = DstVT = MVT::v2f64;
39581 return true;
39582 }
// {4,1,2,3}: MOVSS merges V2's low element into V1.
39583 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39584 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39585 Shuffle = X86ISD::MOVSS;
39586 SrcVT = DstVT = MVT::v4f32;
39587 return true;
39588 }
// {8,1,...,7}: MOVSH (FP16) merges V2's low f16 element into V1.
39589 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39590 DAG) &&
39591 Subtarget.hasFP16()) {
39592 Shuffle = X86ISD::MOVSH;
39593 SrcVT = DstVT = MVT::v8f16;
39594 return true;
39595 }
39596 }
39597
39598 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39599 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39600 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39601 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39602 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39603 Subtarget)) {
39604 DstVT = MaskVT;
39605 return true;
39606 }
39607 }
39608 // TODO: Can we handle this inside matchShuffleWithPACK?
// Truncating {0,2,4,6} of two v2i64 inputs can be done with PACK if the
// discarded (upper) bits are known to be zero/sign bits.
39609 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39610 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39611 V1.getScalarValueSizeInBits() == 64 &&
39612 V2.getScalarValueSizeInBits() == 64) {
39613 // Use (SSE41) PACKUSWD if the leading zerobits goto the lowest 16-bits.
39614 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39615 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39616 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39617 SrcVT = MVT::v4i32;
39618 DstVT = MVT::v8i16;
39619 Shuffle = X86ISD::PACKUS;
39620 return true;
39621 }
39622 // Use PACKUSBW if the leading zerobits goto the lowest 8-bits.
39623 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39624 SrcVT = MVT::v8i16;
39625 DstVT = MVT::v16i8;
39626 Shuffle = X86ISD::PACKUS;
39627 return true;
39628 }
39629 // Use PACKSSWD if the signbits extend to the lowest 16-bits.
39630 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39631 SrcVT = MVT::v4i32;
39632 DstVT = MVT::v8i16;
39633 Shuffle = X86ISD::PACKSS;
39634 return true;
39635 }
39636 }
39637
39638 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39639 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39640 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39641 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39642 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39643 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39644 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39645 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39646 Subtarget)) {
39647 SrcVT = DstVT = MaskVT;
// AVX1-only 256-bit unpacks must use the FP domain types.
39648 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39649 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39650 return true;
39651 }
39652 }
39653
39654 // Attempt to match against a OR if we're performing a blend shuffle and the
39655 // non-blended source element is zero in each case.
39656 // TODO: Handle cases where V1/V2 sizes doesn't match SizeInBits.
39657 if (SizeInBits == V1.getValueSizeInBits() &&
39658 SizeInBits == V2.getValueSizeInBits() &&
39659 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39660 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39661 bool IsBlend = true;
39662 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39663 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
// Scale factors map each mask element onto the (possibly narrower)
// elements of V1/V2.
39664 unsigned Scale1 = NumV1Elts / NumMaskElts;
39665 unsigned Scale2 = NumV2Elts / NumMaskElts;
// Collect, per source, the elements that must be known-zero for the OR
// to reproduce the blend.
39666 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39667 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39668 for (unsigned i = 0; i != NumMaskElts; ++i) {
39669 int M = Mask[i];
39670 if (M == SM_SentinelUndef)
39671 continue;
39672 if (M == SM_SentinelZero) {
39673 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39674 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39675 continue;
39676 }
39677 if (M == (int)i) {
39678 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39679 continue;
39680 }
39681 if (M == (int)(i + NumMaskElts)) {
39682 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39683 continue;
39684 }
39685 IsBlend = false;
39686 break;
39687 }
39688 if (IsBlend) {
39689 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39690 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39691 Shuffle = ISD::OR;
39692 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39693 return true;
39694 }
39695 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39696 // FIXME: handle mismatched sizes?
39697 // TODO: investigate if `ISD::OR` handling in
39698 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
// Per-element known-bits summary: bit i of Zero/One is set if element i
// is known all-zero / all-ones respectively.
39699 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39700 unsigned NumElts = V.getValueType().getVectorNumElements();
39701 KnownBits Known(NumElts);
39702 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39703 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39704 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39705 if (PeepholeKnown.isZero())
39706 Known.Zero.setBit(EltIdx);
39707 if (PeepholeKnown.isAllOnes())
39708 Known.One.setBit(EltIdx);
39709 }
39710 return Known;
39711 };
39712
39713 KnownBits V1Known = computeKnownBitsElementWise(V1);
39714 KnownBits V2Known = computeKnownBitsElementWise(V2);
39715
// OR also works if the non-selected element is all-ones (x | -1 == -1
// == the selected element), not just all-zero.
39716 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39717 int M = Mask[i];
39718 if (M == SM_SentinelUndef)
39719 continue;
39720 if (M == SM_SentinelZero) {
39721 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39722 continue;
39723 }
39724 if (M == (int)i) {
39725 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39726 continue;
39727 }
39728 if (M == (int)(i + NumMaskElts)) {
39729 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39730 continue;
39731 }
39732 llvm_unreachable("will not get here.");
39733 }
39734 if (IsBlend) {
39735 Shuffle = ISD::OR;
39736 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39737 return true;
39738 }
39739 }
39740 }
39741 }
39742
// No supported binary shuffle matched.
39743 return false;
39744}
39745
39747 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39748 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39749 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39750 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39751 unsigned NumMaskElts = Mask.size();
39752 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39753
39754 // Attempt to match against VALIGND/VALIGNQ rotate.
39755 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39756 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39757 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39758 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39759 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39760 MaskVT.getSizeInBits() / EltSizeInBits);
39761 if (!isAnyZero(Mask)) {
39762 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39763 if (0 < Rotation) {
39764 Shuffle = X86ISD::VALIGN;
39765 ShuffleVT = AlignVT;
39766 PermuteImm = Rotation;
39767 return true;
39768 }
39769 }
39770 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39771 unsigned ZeroLo = Zeroable.countr_one();
39772 unsigned ZeroHi = Zeroable.countl_one();
39773 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39774 if (ZeroLo) {
39775 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39776 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39777 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39778 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39779 Shuffle = X86ISD::VALIGN;
39780 ShuffleVT = AlignVT;
39781 PermuteImm = NumMaskElts - ZeroLo;
39782 return true;
39783 }
39784 }
39785 if (ZeroHi) {
39786 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39787 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39788 ZeroHi);
39789 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39790 V2 = V1;
39791 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39792 Shuffle = X86ISD::VALIGN;
39793 ShuffleVT = AlignVT;
39794 PermuteImm = ZeroHi;
39795 return true;
39796 }
39797 }
39798 }
39799
39800 // Attempt to match against PALIGNR byte rotate.
39801 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39802 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39803 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39804 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39805 if (0 < ByteRotation) {
39806 Shuffle = X86ISD::PALIGNR;
39807 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39808 PermuteImm = ByteRotation;
39809 return true;
39810 }
39811 }
39812
39813 // Attempt to combine to X86ISD::BLENDI.
39814 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39815 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39816 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39817 uint64_t BlendMask = 0;
39818 bool ForceV1Zero = false, ForceV2Zero = false;
39819 SmallVector<int, 8> TargetMask(Mask);
39820 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39821 ForceV2Zero, BlendMask)) {
39822 if (MaskVT == MVT::v16i16) {
39823 // We can only use v16i16 PBLENDW if the lanes are repeated.
39824 SmallVector<int, 8> RepeatedMask;
39825 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39826 RepeatedMask)) {
39827 assert(RepeatedMask.size() == 8 &&
39828 "Repeated mask size doesn't match!");
39829 PermuteImm = 0;
39830 for (int i = 0; i < 8; ++i)
39831 if (RepeatedMask[i] >= 8)
39832 PermuteImm |= 1 << i;
39833 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39834 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39835 Shuffle = X86ISD::BLENDI;
39836 ShuffleVT = MaskVT;
39837 return true;
39838 }
39839 } else {
39840 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39841 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39842 PermuteImm = (unsigned)BlendMask;
39843 Shuffle = X86ISD::BLENDI;
39844 ShuffleVT = MaskVT;
39845 return true;
39846 }
39847 }
39848 }
39849
39850 // Attempt to combine to INSERTPS, but only if it has elements that need to
39851 // be set to zero.
39852 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39853 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39854 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39855 Shuffle = X86ISD::INSERTPS;
39856 ShuffleVT = MVT::v4f32;
39857 return true;
39858 }
39859
39860 // Attempt to combine to SHUFPD.
39861 if (AllowFloatDomain && EltSizeInBits == 64 &&
39862 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39863 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39864 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39865 bool ForceV1Zero = false, ForceV2Zero = false;
39866 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39867 PermuteImm, Mask, Zeroable)) {
39868 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39869 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39870 Shuffle = X86ISD::SHUFP;
39871 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39872 return true;
39873 }
39874 }
39875
39876 // Attempt to combine to SHUFPS.
39877 if (AllowFloatDomain && EltSizeInBits == 32 &&
39878 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39879 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39880 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39881 SmallVector<int, 4> RepeatedMask;
39882 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39883 // Match each half of the repeated mask, to determine if its just
39884 // referencing one of the vectors, is zeroable or entirely undef.
39885 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39886 int M0 = RepeatedMask[Offset];
39887 int M1 = RepeatedMask[Offset + 1];
39888
39889 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39890 return DAG.getUNDEF(MaskVT);
39891 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39892 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39893 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39894 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39895 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39896 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39897 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39898 return V1;
39899 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39900 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39901 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39902 return V2;
39903 }
39904
39905 return SDValue();
39906 };
39907
39908 int ShufMask[4] = {-1, -1, -1, -1};
39909 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39910 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39911
39912 if (Lo && Hi) {
39913 V1 = Lo;
39914 V2 = Hi;
39915 Shuffle = X86ISD::SHUFP;
39916 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39917 PermuteImm = getV4X86ShuffleImm(ShufMask);
39918 return true;
39919 }
39920 }
39921 }
39922
39923 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39924 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39925 MaskVT.is128BitVector() &&
39926 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39927 Shuffle = X86ISD::INSERTPS;
39928 ShuffleVT = MVT::v4f32;
39929 return true;
39930 }
39931
39932 return false;
39933}
39934
39936 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39937 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39938 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39939 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39940 const X86Subtarget &Subtarget);
39941
39942/// Combine an arbitrary chain of shuffles into a single instruction if
39943/// possible.
39944///
39945/// This is the leaf of the recursive combine below. When we have found some
39946/// chain of single-use x86 shuffle instructions and accumulated the combined
39947/// shuffle mask represented by them, this will try to pattern match that mask
39948/// into either a single instruction if there is a special purpose instruction
39949/// for this operation, or into a PSHUFB instruction which is a fully general
39950/// instruction but should only be used to replace chains over a certain depth.
39952 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39953 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39954 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39955 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39956 const X86Subtarget &Subtarget) {
39957 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39958 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39959 "Unexpected number of shuffle inputs!");
39960 unsigned RootSizeInBits = RootVT.getSizeInBits();
39961 unsigned NumRootElts = RootVT.getVectorNumElements();
39962
39963 // Canonicalize shuffle input op to the requested type.
39964 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39965 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39966 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39967 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39968 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39969 return DAG.getBitcast(VT, Op);
39970 };
39971
39972 // Find the inputs that enter the chain. Note that multiple uses are OK
39973 // here, we're not going to remove the operands we find.
39974 bool UnaryShuffle = (Inputs.size() == 1);
39975 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39976 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39977 : peekThroughBitcasts(Inputs[1]));
39978
39979 MVT VT1 = V1.getSimpleValueType();
39980 MVT VT2 = V2.getSimpleValueType();
39981 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39982 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39983
39984 SDValue Res;
39985
39986 unsigned NumBaseMaskElts = BaseMask.size();
39987 if (NumBaseMaskElts == 1) {
39988 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39989 return CanonicalizeShuffleInput(RootVT, V1);
39990 }
39991
39992 bool OptForSize = DAG.shouldOptForSize();
39993 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39994 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39995 (RootVT.isFloatingPoint() && Depth >= 1) ||
39996 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39997
39998 // If we are shuffling a splat (and not introducing zeros) then we can just
39999 // use it directly. This works for smaller elements as well as they already
40000 // repeat across each mask element.
40001 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40002 V1.getValueSizeInBits() >= RootSizeInBits &&
40003 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40004 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40005 return CanonicalizeShuffleInput(RootVT, V1);
40006 }
40007
40008 SmallVector<int, 64> Mask(BaseMask);
40009
40010 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40011 // etc. can be simplified.
40012 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40013 SmallVector<int> ScaledMask, IdentityMask;
40014 unsigned NumElts = VT1.getVectorNumElements();
40015 if (Mask.size() <= NumElts &&
40016 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40017 for (unsigned i = 0; i != NumElts; ++i)
40018 IdentityMask.push_back(i);
40019 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40020 V2))
40021 return CanonicalizeShuffleInput(RootVT, V1);
40022 }
40023 }
40024
40025 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40026 if (RootVT.is512BitVector() &&
40027 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40028 // If the upper subvectors are zeroable, then an extract+insert is more
40029 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40030 // to zero the upper subvectors.
40031 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40032 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40033 return SDValue(); // Nothing to do!
40034 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40035 "Unexpected lane shuffle");
40036 Res = CanonicalizeShuffleInput(RootVT, V1);
40037 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40038 bool UseZero = isAnyZero(Mask);
40039 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40040 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40041 }
40042
40043 // Narrow shuffle mask to v4x128.
40044 SmallVector<int, 4> ScaledMask;
40045 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40046 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40047
40048 // Try to lower to vshuf64x2/vshuf32x4.
40049 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40050 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40051 SelectionDAG &DAG) {
40052 int PermMask[4] = {-1, -1, -1, -1};
40053 // Ensure elements came from the same Op.
40054 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40055 for (int i = 0; i < 4; ++i) {
40056 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40057 if (ScaledMask[i] < 0)
40058 continue;
40059
40060 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40061 unsigned OpIndex = i / 2;
40062 if (Ops[OpIndex].isUndef())
40063 Ops[OpIndex] = Op;
40064 else if (Ops[OpIndex] != Op)
40065 return SDValue();
40066
40067 PermMask[i] = ScaledMask[i] % 4;
40068 }
40069
40070 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40071 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40072 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40073 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40074 };
40075
40076 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40077 // doesn't work because our mask is for 128 bits and we don't have an MVT
40078 // to match that.
40079 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40080 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40081 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40082 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40083 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40084 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40085 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40086 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40087 ScaledMask[1] == (ScaledMask[3] % 2));
40088
40089 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40090 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40091 return SDValue(); // Nothing to do!
40092 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40093 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40094 return DAG.getBitcast(RootVT, V);
40095 }
40096 }
40097
40098 // Handle 128-bit lane shuffles of 256-bit vectors.
40099 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40100 // If the upper half is zeroable, then an extract+insert is more optimal
40101 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40102 // zero the upper half.
40103 if (isUndefOrZero(Mask[1])) {
40104 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40105 return SDValue(); // Nothing to do!
40106 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40107 Res = CanonicalizeShuffleInput(RootVT, V1);
40108 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40109 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40110 256);
40111 }
40112
40113 // If we're inserting the low subvector, an insert-subvector 'concat'
40114 // pattern is quicker than VPERM2X128.
40115 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40116 !Subtarget.hasAVX2()) {
40117 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40118 return SDValue(); // Nothing to do!
40119 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40120 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40121 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40122 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40123 }
40124
40125 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40126 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40127 // feature.
40128 // Prefer blends for sequential shuffles unless we are optimizing for size.
40129 if (UnaryShuffle &&
40130 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40131 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40132 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40133 return SDValue(); // Nothing to do!
40134 unsigned PermMask = 0;
40135 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40136 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40137 return DAG.getNode(
40138 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40139 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40140 }
40141
40142 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40143 return SDValue(); // Nothing to do!
40144
40145 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40146 if (!UnaryShuffle && !IsMaskedShuffle) {
40147 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40148 "Unexpected shuffle sentinel value");
40149 // Prefer blends to X86ISD::VPERM2X128.
40150 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40151 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40152 return SDValue(); // Nothing to do!
40153 unsigned PermMask = 0;
40154 PermMask |= ((Mask[0] & 3) << 0);
40155 PermMask |= ((Mask[1] & 3) << 4);
40156 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40157 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40158 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40159 CanonicalizeShuffleInput(RootVT, LHS),
40160 CanonicalizeShuffleInput(RootVT, RHS),
40161 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40162 }
40163 }
40164 }
40165
40166 // For masks that have been widened to 128-bit elements or more,
40167 // narrow back down to 64-bit elements.
40168 if (BaseMaskEltSizeInBits > 64) {
40169 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40170 int MaskScale = BaseMaskEltSizeInBits / 64;
40171 SmallVector<int, 64> ScaledMask;
40172 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40173 Mask = std::move(ScaledMask);
40174 }
40175
40176 // For masked shuffles, we're trying to match the root width for better
40177 // writemask folding, attempt to scale the mask.
40178 // TODO - variable shuffles might need this to be widened again.
40179 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40180 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40181 int MaskScale = NumRootElts / Mask.size();
40182 SmallVector<int, 64> ScaledMask;
40183 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40184 Mask = std::move(ScaledMask);
40185 }
40186
40187 unsigned NumMaskElts = Mask.size();
40188 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40189 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40190
40191 // Determine the effective mask value type.
40192 FloatDomain &= (32 <= MaskEltSizeInBits);
40193 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40194 : MVT::getIntegerVT(MaskEltSizeInBits);
40195 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40196
40197 // Only allow legal mask types.
40198 if (!TLI.isTypeLegal(MaskVT))
40199 return SDValue();
40200
40201 // Attempt to match the mask against known shuffle patterns.
40202 MVT ShuffleSrcVT, ShuffleVT;
40203 unsigned Shuffle, PermuteImm;
40204
40205 // Which shuffle domains are permitted?
40206 // Permit domain crossing at higher combine depths.
40207 // TODO: Should we indicate which domain is preferred if both are allowed?
40208 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40209 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40210 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40211
40212 // Determine zeroable mask elements.
40213 APInt KnownUndef, KnownZero;
40214 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40215 APInt Zeroable = KnownUndef | KnownZero;
40216
40217 if (UnaryShuffle) {
40218 // Attempt to match against broadcast-from-vector.
40219 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40220 if ((Subtarget.hasAVX2() ||
40221 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40222 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40223 if (isUndefOrEqual(Mask, 0)) {
40224 if (V1.getValueType() == MaskVT &&
40226 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40227 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40228 return SDValue(); // Nothing to do!
40229 Res = V1.getOperand(0);
40230 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40231 return DAG.getBitcast(RootVT, Res);
40232 }
40233 if (Subtarget.hasAVX2()) {
40234 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40235 return SDValue(); // Nothing to do!
40236 Res = CanonicalizeShuffleInput(MaskVT, V1);
40237 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40238 return DAG.getBitcast(RootVT, Res);
40239 }
40240 }
40241 }
40242
40243 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40244 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40245 (!IsMaskedShuffle ||
40246 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40247 if (Depth == 0 && RootOpc == Shuffle)
40248 return SDValue(); // Nothing to do!
40249 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40250 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40251 return DAG.getBitcast(RootVT, Res);
40252 }
40253
40254 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40255 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40256 PermuteImm) &&
40257 (!IsMaskedShuffle ||
40258 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40259 if (Depth == 0 && RootOpc == Shuffle)
40260 return SDValue(); // Nothing to do!
40261 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40262 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40263 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40264 return DAG.getBitcast(RootVT, Res);
40265 }
40266 }
40267
40268 // Attempt to combine to INSERTPS, but only if the inserted element has come
40269 // from a scalar.
40270 // TODO: Handle other insertions here as well?
40271 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40272 Subtarget.hasSSE41() &&
40273 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40274 if (MaskEltSizeInBits == 32) {
40275 SDValue SrcV1 = V1, SrcV2 = V2;
40276 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40277 DAG) &&
40278 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40279 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40280 return SDValue(); // Nothing to do!
40281 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40282 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40283 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40284 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40285 return DAG.getBitcast(RootVT, Res);
40286 }
40287 }
40288 if (MaskEltSizeInBits == 64 &&
40289 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40291 V2.getScalarValueSizeInBits() <= 32) {
40292 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40293 return SDValue(); // Nothing to do!
40294 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40295 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40296 CanonicalizeShuffleInput(MVT::v4f32, V1),
40297 CanonicalizeShuffleInput(MVT::v4f32, V2),
40298 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40299 return DAG.getBitcast(RootVT, Res);
40300 }
40301 }
40302
40303 SDValue NewV1 = V1; // Save operands in case early exit happens.
40304 SDValue NewV2 = V2;
40305 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40306 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40307 ShuffleVT, UnaryShuffle) &&
40308 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40309 if (Depth == 0 && RootOpc == Shuffle)
40310 return SDValue(); // Nothing to do!
40311 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40312 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40313 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40314 return DAG.getBitcast(RootVT, Res);
40315 }
40316
40317 NewV1 = V1; // Save operands in case early exit happens.
40318 NewV2 = V2;
40319 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40320 AllowIntDomain, NewV1, NewV2, DL, DAG,
40321 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40322 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40323 if (Depth == 0 && RootOpc == Shuffle)
40324 return SDValue(); // Nothing to do!
40325 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40326 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40327 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40328 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40329 return DAG.getBitcast(RootVT, Res);
40330 }
40331
40332 // Typically from here on, we need an integer version of MaskVT.
40333 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40334 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40335
40336 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40337 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40338 uint64_t BitLen, BitIdx;
40339 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40340 Zeroable)) {
40341 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40342 return SDValue(); // Nothing to do!
40343 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40344 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40345 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40346 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40347 return DAG.getBitcast(RootVT, Res);
40348 }
40349
40350 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40351 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40352 return SDValue(); // Nothing to do!
40353 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40354 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40355 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40356 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40357 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40358 return DAG.getBitcast(RootVT, Res);
40359 }
40360 }
40361
40362 // Match shuffle against TRUNCATE patterns.
40363 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40364 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40365 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40366 Subtarget)) {
40367 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40368 ShuffleSrcVT.getVectorNumElements();
40369 unsigned Opc =
40370 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40371 if (Depth == 0 && RootOpc == Opc)
40372 return SDValue(); // Nothing to do!
40373 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40374 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40375 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40376 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40377 return DAG.getBitcast(RootVT, Res);
40378 }
40379
40380 // Do we need a more general binary truncation pattern?
40381 if (RootSizeInBits < 512 &&
40382 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40383 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40384 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40385 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40386 // Bail if this was already a truncation or PACK node.
40387 // We sometimes fail to match PACK if we demand known undef elements.
40388 if (Depth == 0 &&
40389 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40390 RootOpc == X86ISD::PACKUS))
40391 return SDValue(); // Nothing to do!
40392 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40393 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40394 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40395 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40396 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40397 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40398 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40399 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40400 return DAG.getBitcast(RootVT, Res);
40401 }
40402 }
40403
40404 // Don't try to re-form single instruction chains under any circumstances now
40405 // that we've done encoding canonicalization for them.
40406 if (Depth < 1)
40407 return SDValue();
40408
40409 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40410 return isTargetShuffleVariableMask(N->getOpcode());
40411 });
40412 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40413 return (N->getOpcode() == X86ISD::VPERMV3 ||
40414 N->getOpcode() == X86ISD::VPERMV);
40415 });
40416
40417 // Depth threshold above which we can efficiently use variable mask shuffles.
40418 int VariableCrossLaneShuffleDepth =
40419 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40420 int VariablePerLaneShuffleDepth =
40421 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40422 AllowVariableCrossLaneMask &=
40423 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40424 AllowVariablePerLaneMask &=
40425 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40426 // VPERM2W/VPERM2B are 3 uops on Skylake and Icelake so we require a
40427 // higher depth before combining them.
40428 int BWIVPERMV3ShuffleDepth =
40429 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40430 bool AllowBWIVPERMV3 =
40431 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40432
40433 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40434 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40435 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40436
40437 bool MaskContainsZeros = isAnyZero(Mask);
40438
40439 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40440 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40441 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40442 if (Subtarget.hasAVX2() &&
40443 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40444 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40445 Res = CanonicalizeShuffleInput(MaskVT, V1);
40446 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40447 return DAG.getBitcast(RootVT, Res);
40448 }
40449 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40450 if ((Subtarget.hasAVX512() &&
40451 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40452 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40453 (Subtarget.hasBWI() &&
40454 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40455 (Subtarget.hasVBMI() &&
40456 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40457 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40458 V2 = DAG.getUNDEF(MaskVT);
40459 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40460 return DAG.getBitcast(RootVT, Res);
40461 }
40462 }
40463
40464 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40465 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40466 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40467 ((Subtarget.hasAVX512() &&
40468 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40469 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40470 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40471 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40472 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40473 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40474 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40475 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40476 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40477 for (unsigned i = 0; i != NumMaskElts; ++i)
40478 if (Mask[i] == SM_SentinelZero)
40479 Mask[i] = NumMaskElts + i;
40480 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40481 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40482 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40483 return DAG.getBitcast(RootVT, Res);
40484 }
40485
40486 // If that failed and either input is extracted then try to combine as a
40487 // shuffle with the larger type.
40489 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40490 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40491 IsMaskedShuffle, DAG, DL, Subtarget))
40492 return WideShuffle;
40493
40494 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40495 // (non-VLX will pad to 512-bit shuffles).
40496 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40497 ((Subtarget.hasAVX512() &&
40498 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40499 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40500 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40501 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40502 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40503 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40504 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40505 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40506 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40507 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40508 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40509 return DAG.getBitcast(RootVT, Res);
40510 }
40511 return SDValue();
40512 }
40513
40514 // See if we can combine a single input shuffle with zeros to a bit-mask,
40515 // which is much simpler than any shuffle.
40516 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40517 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40518 TLI.isTypeLegal(MaskVT)) {
40519 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40520 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40521 APInt UndefElts(NumMaskElts, 0);
40522 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40523 for (unsigned i = 0; i != NumMaskElts; ++i) {
40524 int M = Mask[i];
40525 if (M == SM_SentinelUndef) {
40526 UndefElts.setBit(i);
40527 continue;
40528 }
40529 if (M == SM_SentinelZero)
40530 continue;
40531 EltBits[i] = AllOnes;
40532 }
40533 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40534 Res = CanonicalizeShuffleInput(MaskVT, V1);
40535 unsigned AndOpcode =
40537 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40538 return DAG.getBitcast(RootVT, Res);
40539 }
40540
40541 // If we have a single input shuffle with different shuffle patterns in the
40542 // the 128-bit lanes use the variable mask to VPERMILPS.
40543 // TODO Combine other mask types at higher depths.
40544 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40545 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40546 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40547 SmallVector<SDValue, 16> VPermIdx;
40548 for (int M : Mask) {
40549 SDValue Idx =
40550 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40551 VPermIdx.push_back(Idx);
40552 }
40553 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40554 Res = CanonicalizeShuffleInput(MaskVT, V1);
40555 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40556 return DAG.getBitcast(RootVT, Res);
40557 }
40558
40559 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40560 // to VPERMIL2PD/VPERMIL2PS.
40561 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40562 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40563 MaskVT == MVT::v8f32)) {
40564 // VPERMIL2 Operation.
40565 // Bits[3] - Match Bit.
40566 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40567 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40568 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40569 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40570 SmallVector<int, 8> VPerm2Idx;
40571 unsigned M2ZImm = 0;
40572 for (int M : Mask) {
40573 if (M == SM_SentinelUndef) {
40574 VPerm2Idx.push_back(-1);
40575 continue;
40576 }
40577 if (M == SM_SentinelZero) {
40578 M2ZImm = 2;
40579 VPerm2Idx.push_back(8);
40580 continue;
40581 }
40582 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40583 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40584 VPerm2Idx.push_back(Index);
40585 }
40586 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40587 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40588 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40589 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40590 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40591 return DAG.getBitcast(RootVT, Res);
40592 }
40593
40594 // If we have 3 or more shuffle instructions or a chain involving a variable
40595 // mask, we can replace them with a single PSHUFB instruction profitably.
40596 // Intel's manuals suggest only using PSHUFB if doing so replacing 5
40597 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40598 // more aggressive.
40599 if (UnaryShuffle && AllowVariablePerLaneMask &&
40600 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40601 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40602 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40603 SmallVector<SDValue, 16> PSHUFBMask;
40604 int NumBytes = RootVT.getSizeInBits() / 8;
40605 int Ratio = NumBytes / NumMaskElts;
40606 for (int i = 0; i < NumBytes; ++i) {
40607 int M = Mask[i / Ratio];
40608 if (M == SM_SentinelUndef) {
40609 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40610 continue;
40611 }
40612 if (M == SM_SentinelZero) {
40613 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40614 continue;
40615 }
40616 M = Ratio * M + i % Ratio;
40617 assert((M / 16) == (i / 16) && "Lane crossing detected");
40618 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40619 }
40620 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40621 Res = CanonicalizeShuffleInput(ByteVT, V1);
40622 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40623 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40624 return DAG.getBitcast(RootVT, Res);
40625 }
40626
40627 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40628 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40629 // slower than PSHUFB on targets that support both.
40630 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40631 Subtarget.hasXOP()) {
40632 // VPPERM Mask Operation
40633 // Bits[4:0] - Byte Index (0 - 31)
40634 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40635 SmallVector<SDValue, 16> VPPERMMask;
40636 int NumBytes = 16;
40637 int Ratio = NumBytes / NumMaskElts;
40638 for (int i = 0; i < NumBytes; ++i) {
40639 int M = Mask[i / Ratio];
40640 if (M == SM_SentinelUndef) {
40641 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40642 continue;
40643 }
40644 if (M == SM_SentinelZero) {
40645 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40646 continue;
40647 }
40648 M = Ratio * M + i % Ratio;
40649 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40650 }
40651 MVT ByteVT = MVT::v16i8;
40652 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40653 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40654 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40655 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40656 return DAG.getBitcast(RootVT, Res);
40657 }
40658
40659 // If that failed and either input is extracted then try to combine as a
40660 // shuffle with the larger type.
40662 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40663 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40664 DAG, DL, Subtarget))
40665 return WideShuffle;
40666
40667 // If we have a dual input shuffle then lower to VPERMV3,
40668 // (non-VLX will pad to 512-bit shuffles)
40669 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40670 ((Subtarget.hasAVX512() &&
40671 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40672 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40673 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40674 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40675 MaskVT == MVT::v16i32)) ||
40676 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40677 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40678 MaskVT == MVT::v32i16)) ||
40679 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40680 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40681 MaskVT == MVT::v64i8)))) {
40682 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40683 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40684 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40685 return DAG.getBitcast(RootVT, Res);
40686 }
40687
40688 // Failed to find any combines.
40689 return SDValue();
40690}
40691
40692// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40693// instruction if possible.
40694//
40695// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40696// type size to attempt to combine:
40697// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40698// -->
40699// extract_subvector(shuffle(x,y,m2),0)
// NOTE(review): this doxygen-rendered listing drops hyperlinked source lines;
// the function signature line ('static SDValue
// combineX86ShuffleChainWithExtract(', original line 40700) is missing here —
// consult the original X86ISelLowering.cpp.
40701 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40702 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40703 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40704 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40705 const X86Subtarget &Subtarget) {
40706 unsigned NumMaskElts = BaseMask.size();
40707 unsigned NumInputs = Inputs.size();
40708 if (NumInputs == 0)
40709 return SDValue();
40710
40711 unsigned RootSizeInBits = RootVT.getSizeInBits();
  // Size in bits of one element of the root shuffle mask; the assert below
  // guarantees the division is exact.
40712 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40713 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40714
40715 // Peek through subvectors to find widest legal vector.
40716 // TODO: Handle ISD::TRUNCATE
40717 unsigned WideSizeInBits = RootSizeInBits;
40718 for (SDValue Input : Inputs) {
40719 Input = peekThroughBitcasts(Input);
40720 while (1) {
40721 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40722 Input = peekThroughBitcasts(Input.getOperand(0));
40723 continue;
40724 }
    // An INSERT_SUBVECTOR of an undef base at index 0 is just a widened copy
    // of its subvector operand, so peek through it as well.
40725 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40726 Input.getOperand(0).isUndef() &&
40727 isNullConstant(Input.getOperand(2))) {
40728 Input = peekThroughBitcasts(Input.getOperand(1));
40729 continue;
40730 }
40731 break;
40732 }
   // Track the widest type-legal source seen so far.
40733 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40734 WideSizeInBits < Input.getValueSizeInBits())
40735 WideSizeInBits = Input.getValueSizeInBits();
40736 }
40737
40738 // Bail if we fail to find a source larger than the existing root.
40739 if (WideSizeInBits <= RootSizeInBits ||
40740 (WideSizeInBits % RootSizeInBits) != 0)
40741 return SDValue();
40742
40743 // Create new mask for larger type.
40744 SmallVector<int, 64> WideMask;
40745 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40746
40747 // Attempt to peek through inputs and adjust mask when we extract from an
40748 // upper subvector.
40749 int AdjustedMasks = 0;
40750 SmallVector<SDValue, 4> WideInputs(Inputs);
40751 for (unsigned I = 0; I != NumInputs; ++I) {
40752 SDValue &Input = WideInputs[I];
40753 Input = peekThroughBitcasts(Input);
40754 while (1) {
40755 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40756 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
     // NOTE(review): 'Idx' is declared on a dropped (hyperlinked) line here
     // (original line 40757) — presumably the EXTRACT_SUBVECTOR's constant
     // start index; confirm against the original source.
40758 if (Idx != 0) {
40759 ++AdjustedMasks;
      // Rescale the extract index from input-element units to root-mask
      // element units before applying it to the mask.
40760 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40761 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40762
      // Only shift mask elements that refer to this input's slot [lo, hi);
      // elements referencing other inputs are left untouched.
40763 int lo = I * WideMask.size();
40764 int hi = (I + 1) * WideMask.size();
40765 for (int &M : WideMask)
40766 if (lo <= M && M < hi)
40767 M += Idx;
40768 }
40769 Input = peekThroughBitcasts(Input.getOperand(0));
40770 continue;
40771 }
40772 // TODO: Handle insertions into upper subvectors.
40773 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40774 Input.getOperand(0).isUndef() &&
40775 isNullConstant(Input.getOperand(2))) {
40776 Input = peekThroughBitcasts(Input.getOperand(1));
40777 continue;
40778 }
40779 break;
40780 }
40781 }
40782
40783 // Remove unused/repeated shuffle source ops.
40784 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40785 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40786
40787 // Bail if we're always extracting from the lowest subvectors,
40788 // combineX86ShuffleChain should match this for the current width, or the
40789 // shuffle still references too many inputs.
40790 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40791 return SDValue();
40792
40793 // Minor canonicalization of the accumulated shuffle mask to make it easier
40794 // to match below. All this does is detect masks with sequential pairs of
40795 // elements, and shrink them to the half-width mask. It does this in a loop
40796 // so it will reduce the size of the mask to the minimal width mask which
40797 // performs an equivalent shuffle.
40798 while (WideMask.size() > 1) {
40799 SmallVector<int, 64> WidenedMask;
40800 if (!canWidenShuffleElements(WideMask, WidenedMask))
40801 break;
40802 WideMask = std::move(WidenedMask);
40803 }
40804
40805 // Canonicalization of binary shuffle masks to improve pattern matching by
40806 // commuting the inputs.
40807 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
   // NOTE(review): a dropped (hyperlinked) line here (original line 40808)
   // presumably commutes WideMask to match the swapped inputs; confirm
   // against the original source.
40809 std::swap(WideInputs[0], WideInputs[1]);
40810 }
40811
40812 // Increase depth for every upper subvector we've peeked through.
40813 Depth += AdjustedMasks;
40814
40815 // Attempt to combine wider chain.
40816 // TODO: Can we use a better Root?
 // Use whichever remaining input is larger as the root node so the recursive
 // combine runs at the full WideSizeInBits width (checked by the assert).
40817 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40818 WideInputs.back().getValueSizeInBits()
40819 ? WideInputs.front()
40820 : WideInputs.back();
40821 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40822 "WideRootSize mismatch");
40823
 // On success, shrink the wide combined shuffle back down to the original
 // root width and bitcast to the root type.
40824 if (SDValue WideShuffle = combineX86ShuffleChain(
40825 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40826 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40827 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40828 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40829 return DAG.getBitcast(RootVT, WideShuffle);
40830 }
40831
40832 return SDValue();
40833}
40834
40835// Canonicalize the combined shuffle mask chain with horizontal ops.
40836// NOTE: This may update the Ops and Mask.
40839 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40840 const X86Subtarget &Subtarget) {
40841 if (Mask.empty() || Ops.empty())
40842 return SDValue();
40843
40845 for (SDValue Op : Ops)
40847
40848 // All ops must be the same horizop + type.
40849 SDValue BC0 = BC[0];
40850 EVT VT0 = BC0.getValueType();
40851 unsigned Opcode0 = BC0.getOpcode();
40852 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40853 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40854 }))
40855 return SDValue();
40856
40857 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40858 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40859 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40860 if (!isHoriz && !isPack)
40861 return SDValue();
40862
40863 // Do all ops have a single use?
40864 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40865 return Op.hasOneUse() &&
40867 });
40868
40869 int NumElts = VT0.getVectorNumElements();
40870 int NumLanes = VT0.getSizeInBits() / 128;
40871 int NumEltsPerLane = NumElts / NumLanes;
40872 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40873 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40874 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40875
40876 if (NumEltsPerLane >= 4 &&
40877 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40878 SmallVector<int> LaneMask, ScaledMask;
40879 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40880 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40881 // See if we can remove the shuffle by resorting the HOP chain so that
40882 // the HOP args are pre-shuffled.
40883 // TODO: Generalize to any sized/depth chain.
40884 // TODO: Add support for PACKSS/PACKUS.
40885 if (isHoriz) {
40886 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40887 auto GetHOpSrc = [&](int M) {
40888 if (M == SM_SentinelUndef)
40889 return DAG.getUNDEF(VT0);
40890 if (M == SM_SentinelZero)
40891 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40892 SDValue Src0 = BC[M / 4];
40893 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40894 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40895 return Src1.getOperand(M % 2);
40896 return SDValue();
40897 };
40898 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40899 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40900 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40901 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40902 if (M0 && M1 && M2 && M3) {
40903 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40904 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40905 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40906 }
40907 }
40908 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40909 if (Ops.size() >= 2) {
40910 SDValue LHS, RHS;
40911 auto GetHOpSrc = [&](int M, int &OutM) {
40912 // TODO: Support SM_SentinelZero
40913 if (M < 0)
40914 return M == SM_SentinelUndef;
40915 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40916 if (!LHS || LHS == Src) {
40917 LHS = Src;
40918 OutM = (M % 2);
40919 return true;
40920 }
40921 if (!RHS || RHS == Src) {
40922 RHS = Src;
40923 OutM = (M % 2) + 2;
40924 return true;
40925 }
40926 return false;
40927 };
40928 int PostMask[4] = {-1, -1, -1, -1};
40929 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40930 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40931 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40932 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40933 LHS = DAG.getBitcast(SrcVT, LHS);
40934 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40935 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40936 // Use SHUFPS for the permute so this will work on SSE2 targets,
40937 // shuffle combining and domain handling will simplify this later on.
40938 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40939 Res = DAG.getBitcast(ShuffleVT, Res);
40940 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40941 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40942 }
40943 }
40944 }
40945 }
40946
40947 if (2 < Ops.size())
40948 return SDValue();
40949
40950 SDValue BC1 = BC[BC.size() - 1];
40951 if (Mask.size() == VT0.getVectorNumElements()) {
40952 // Canonicalize binary shuffles of horizontal ops that use the
40953 // same sources to an unary shuffle.
40954 // TODO: Try to perform this fold even if the shuffle remains.
40955 if (Ops.size() == 2) {
40956 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40957 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40958 };
40959 // Commute if all BC0's ops are contained in BC1.
40960 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40961 ContainsOps(BC1, BC0.getOperand(1))) {
40963 std::swap(Ops[0], Ops[1]);
40964 std::swap(BC0, BC1);
40965 }
40966
40967 // If BC1 can be represented by BC0, then convert to unary shuffle.
40968 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40969 ContainsOps(BC0, BC1.getOperand(1))) {
40970 for (int &M : Mask) {
40971 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40972 continue;
40973 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40974 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40975 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40976 M += NumHalfEltsPerLane;
40977 }
40978 }
40979 }
40980
40981 // Canonicalize unary horizontal ops to only refer to lower halves.
40982 for (int i = 0; i != NumElts; ++i) {
40983 int &M = Mask[i];
40984 if (isUndefOrZero(M))
40985 continue;
40986 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40987 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40988 M -= NumHalfEltsPerLane;
40989 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40990 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40991 M -= NumHalfEltsPerLane;
40992 }
40993 }
40994
40995 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40996 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40997 // represents the LHS/RHS inputs for the lower/upper halves.
40998 SmallVector<int, 16> TargetMask128, WideMask128;
40999 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41000 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41001 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41002 bool SingleOp = (Ops.size() == 1);
41003 if (isPack || OneUseOps ||
41004 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41005 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41006 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41007 Lo = Lo.getOperand(WideMask128[0] & 1);
41008 Hi = Hi.getOperand(WideMask128[1] & 1);
41009 if (SingleOp) {
41010 SDValue Undef = DAG.getUNDEF(SrcVT);
41011 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41012 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41013 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41014 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41015 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41016 }
41017 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41018 }
41019 }
41020
41021 // If we are post-shuffling a 256-bit hop and not requiring the upper
41022 // elements, then try to narrow to a 128-bit hop directly.
41023 SmallVector<int, 16> WideMask64;
41024 if (Ops.size() == 1 && NumLanes == 2 &&
41025 scaleShuffleElements(Mask, 4, WideMask64) &&
41026 isUndefInRange(WideMask64, 2, 2)) {
41027 int M0 = WideMask64[0];
41028 int M1 = WideMask64[1];
41029 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41031 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41032 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41033 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41034 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41035 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41036 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41037 }
41038 }
41039
41040 return SDValue();
41041}
41042
// NOTE(review): the extracted dump is missing the first signature line
// (presumably `static SDValue combineX86ShufflesConstants(MVT VT,
// ArrayRef<SDValue> Ops, ...` - TODO confirm upstream). Code lines are kept
// byte-for-byte as extracted.
//
// Purpose: if every shuffle source op has extractable constant bits, apply
// the shuffle mask to those bits at compile time and return a single
// constant vector (or a zero vector) of type VT, avoiding the shuffle
// entirely. Returns an empty SDValue if any source is non-constant, if
// folding would bloat the constant pool under -Os/-Oz, or if the constant
// vector type is not legal.
41043// Attempt to constant fold all of the constant source ops.
41044// Returns true if the entire shuffle is folded to a constant.
41045// TODO: Extend this to merge multiple constant Ops and update the mask.
41047                                           ArrayRef<int> Mask,
41048                                           ArrayRef<const SDNode *> SrcNodes,
41049                                           SelectionDAG &DAG, const SDLoc &DL,
41050                                           const X86Subtarget &Subtarget) {
41051  unsigned SizeInBits = VT.getSizeInBits();
41052  unsigned NumMaskElts = Mask.size();
41053  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41054  unsigned NumOps = Ops.size();
41055
41056  // Extract constant bits from each source op.
41057  SmallVector<APInt, 16> UndefEltsOps(NumOps);
41058  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41059  for (unsigned I = 0; I != NumOps; ++I)
41060    if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41061                                       RawBitsOps[I],
41062                                       /*AllowWholeUndefs*/ true,
41063                                       /*AllowPartialUndefs*/ true))
41064      return SDValue();
41065
41066  // If we're optimizing for size, only fold if at least one of the constants is
41067  // only used once or the combined shuffle has included a variable mask
41068  // shuffle, this is to avoid constant pool bloat.
41069  bool IsOptimizingSize = DAG.shouldOptForSize();
41070  bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41071    return isTargetShuffleVariableMask(N->getOpcode());
41072  });
41073  if (IsOptimizingSize && !HasVariableMask &&
41074      llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41075    return SDValue();
41076
41077  // Shuffle the constant bits according to the mask.
  // Per-element classification: undef, known-zero, or a concrete constant.
41078  APInt UndefElts(NumMaskElts, 0);
41079  APInt ZeroElts(NumMaskElts, 0);
41080  APInt ConstantElts(NumMaskElts, 0);
41081  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41082                                        APInt::getZero(MaskSizeInBits));
41083  for (unsigned i = 0; i != NumMaskElts; ++i) {
41084    int M = Mask[i];
41085    if (M == SM_SentinelUndef) {
41086      UndefElts.setBit(i);
41087      continue;
41088    } else if (M == SM_SentinelZero) {
41089      ZeroElts.setBit(i);
41090      continue;
41091    }
41092    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41093
    // Mask indices are laid out as NumOps consecutive NumMaskElts ranges.
41094    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41095    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41096
41097    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41098    if (SrcUndefElts[SrcMaskIdx]) {
41099      UndefElts.setBit(i);
41100      continue;
41101    }
41102
41103    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41104    APInt &Bits = SrcEltBits[SrcMaskIdx];
41105    if (!Bits) {
41106      ZeroElts.setBit(i);
41107      continue;
41108    }
41109
41110    ConstantElts.setBit(i);
41111    ConstantBitData[i] = Bits;
41112  }
  // Every element must have been classified exactly once.
41113  assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41114
41115  // Attempt to create a zero vector.
41116  if ((UndefElts | ZeroElts).isAllOnes())
41117    return getZeroVector(VT, Subtarget, DAG, DL);
41118
41119  // Create the constant data.
  // Prefer an FP element type when VT is FP so the constant stays in-domain.
41120  MVT MaskSVT;
41121  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41122    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41123  else
41124    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41125
41126  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41127  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41128    return SDValue();
41129
41130  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41131  return DAG.getBitcast(VT, CstOp);
41132}
41133
// NOTE(review): the enumerator line(s) inside this enum (original lines
// 41137-41138) were dropped by the extraction. Based on the later use of
// X86::MaxShuffleCombineDepth in this file, the enum presumably declares
// that recursion-depth limit - TODO confirm against the upstream source.
41134namespace llvm {
41135  namespace X86 {
41136    enum {
41139  } // namespace X86
41140} // namespace llvm
41141
// NOTE(review): this dump is missing multiple original source lines,
// including the function signature (41171), local Mask/Ops declarations
// (~41271-41272), several call-expression head lines (e.g. the
// `if (SDValue Res = combineX86ShufflesRecursively(` line ~41478, the
// combineX86ShufflesConstants / canonicalizeShuffleMaskWithHorizOp /
// SimplifyMultipleUseDemandedVectorElts heads, the final
// combineX86ShuffleChainWithExtract head ~41608) and a few statements
// (resolveTargetShuffleInputsAndMask calls, commuteMask calls, LHS/RHS
// peekThroughBitcasts decls). Code lines are kept byte-for-byte as
// extracted - confirm against the upstream file before relying on them.
41142/// Fully generic combining of x86 shuffle instructions.
41143///
41144/// This should be the last combine run over the x86 shuffle instructions. Once
41145/// they have been fully optimized, this will recursively consider all chains
41146/// of single-use shuffle instructions, build a generic model of the cumulative
41147/// shuffle operation, and check for simpler instructions which implement this
41148/// operation. We use this primarily for two purposes:
41149///
41150/// 1) Collapse generic shuffles to specialized single instructions when
41151///    equivalent. In most cases, this is just an encoding size win, but
41152///    sometimes we will collapse multiple generic shuffles into a single
41153///    special-purpose shuffle.
41154/// 2) Look for sequences of shuffle instructions with 3 or more total
41155///    instructions, and replace them with the slightly more expensive SSSE3
41156///    PSHUFB instruction if available. We do this as the last combining step
41157///    to ensure we avoid using PSHUFB if we can implement the shuffle with
41158///    a suitable short sequence of other instructions. The PSHUFB will either
41159///    use a register or have to read from memory and so is slightly (but only
41160///    slightly) more expensive than the other shuffle instructions.
41161///
41162/// Because this is inherently a quadratic operation (for each shuffle in
41163/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41164/// This should never be an issue in practice as the shuffle lowering doesn't
41165/// produce sequences of more than 8 instructions.
41166///
41167/// FIXME: We will currently miss some cases where the redundant shuffling
41168/// would simplify under the threshold for PSHUFB formation because of
41169/// combine-ordering. To fix this, we should do the redundant instruction
41170/// combining in this recursive walk.
41172    ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41173    ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41174    unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41175    bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41176    const SDLoc &DL, const X86Subtarget &Subtarget) {
41177  assert(!RootMask.empty() &&
41178         (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41179         "Illegal shuffle root mask");
41180  assert(RootVT.isVector() && "Shuffles operate on vector types!");
41181  unsigned RootSizeInBits = RootVT.getSizeInBits();
41182  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41183
41184  // Bound the depth of our recursive combine because this is ultimately
41185  // quadratic in nature.
41186  if (Depth >= MaxDepth)
41187    return SDValue();
41188
41189  // Directly rip through bitcasts to find the underlying operand.
41190  SDValue Op = SrcOps[SrcOpIndex];
41192
41193  EVT VT = Op.getValueType();
41194  if (!VT.isVector() || !VT.isSimple())
41195    return SDValue(); // Bail if we hit a non-simple non-vector.
41196
41197  // FIXME: Just bail on f16 for now.
41198  if (VT.getVectorElementType() == MVT::f16)
41199    return SDValue();
41200
41201  assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41202         "Can only combine shuffles upto size of the root op.");
41203
41204  // Create a demanded elts mask from the referenced elements of Op.
41205  APInt OpDemandedElts = APInt::getZero(RootMask.size());
41206  for (int M : RootMask) {
41207    int BaseIdx = RootMask.size() * SrcOpIndex;
41208    if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41209      OpDemandedElts.setBit(M - BaseIdx);
41210  }
41211  if (RootSizeInBits != VT.getSizeInBits()) {
41212    // Op is smaller than Root - extract the demanded elts for the subvector.
41213    unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41214    unsigned NumOpMaskElts = RootMask.size() / Scale;
41215    assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41216    assert(OpDemandedElts
41217               .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41218               .isZero() &&
41219           "Out of range elements referenced in root mask");
41220    OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41221  }
41222  OpDemandedElts =
41223      APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41224
41225  // Extract target shuffle mask and resolve sentinels and inputs.
41226  SmallVector<int, 64> OpMask;
41227  SmallVector<SDValue, 2> OpInputs;
41228  APInt OpUndef, OpZero;
41229  if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41230                             OpZero, DAG, Depth, false)) {
41231    // Shuffle inputs must not be larger than the shuffle result.
41232    // TODO: Relax this for single input faux shuffles (e.g. trunc).
41233    if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41234          return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41235        }))
41236      return SDValue();
41237  } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41238             (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41239             !isNullConstant(Op.getOperand(1))) {
    // Not a target shuffle, but a non-zero-offset subvector extraction can
    // be modelled as an identity shuffle starting at ExtractIdx.
41240    SDValue SrcVec = Op.getOperand(0);
41241    int ExtractIdx = Op.getConstantOperandVal(1);
41242    unsigned NumElts = VT.getVectorNumElements();
41243    OpInputs.assign({SrcVec});
41244    OpMask.assign(NumElts, SM_SentinelUndef);
41245    std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41246    OpZero = OpUndef = APInt::getZero(NumElts);
41247  } else {
41248    return SDValue();
41249  }
41250
41251  // If the shuffle result was smaller than the root, we need to adjust the
41252  // mask indices and pad the mask with undefs.
41253  if (RootSizeInBits > VT.getSizeInBits()) {
41254    unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41255    unsigned OpMaskSize = OpMask.size();
41256    if (OpInputs.size() > 1) {
      // Rescale multi-input indices so each input occupies a padded range.
41257      unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41258      for (int &M : OpMask) {
41259        if (M < 0)
41260          continue;
41261        int EltIdx = M % OpMaskSize;
41262        int OpIdx = M / OpMaskSize;
41263        M = (PaddedMaskSize * OpIdx) + EltIdx;
41264      }
41265    }
41266    OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41267    OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41268    OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41269  }
41270
41273
41274  // We don't need to merge masks if the root is empty.
41275  bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41276  if (EmptyRoot) {
41277    // Only resolve zeros if it will remove an input, otherwise we might end
41278    // up in an infinite loop.
41279    bool ResolveKnownZeros = true;
41280    if (!OpZero.isZero()) {
41281      APInt UsedInputs = APInt::getZero(OpInputs.size());
41282      for (int i = 0, e = OpMask.size(); i != e; ++i) {
41283        int M = OpMask[i];
41284        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41285          continue;
41286        UsedInputs.setBit(M / OpMask.size());
41287        if (UsedInputs.isAllOnes()) {
41288          ResolveKnownZeros = false;
41289          break;
41290        }
41291      }
41292    }
41293    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41294                                      ResolveKnownZeros);
41295
41296    Mask = OpMask;
41297    Ops.append(OpInputs.begin(), OpInputs.end());
41298  } else {
41299    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41300
41301    // Add the inputs to the Ops list, avoiding duplicates.
41302    Ops.append(SrcOps.begin(), SrcOps.end());
41303
41304    auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41305      // Attempt to find an existing match.
41306      SDValue InputBC = peekThroughBitcasts(Input);
41307      for (int i = 0, e = Ops.size(); i < e; ++i)
41308        if (InputBC == peekThroughBitcasts(Ops[i]))
41309          return i;
41310      // Match failed - should we replace an existing Op?
41311      if (InsertionPoint >= 0) {
41312        Ops[InsertionPoint] = Input;
41313        return InsertionPoint;
41314      }
41315      // Add to the end of the Ops list.
41316      Ops.push_back(Input);
41317      return Ops.size() - 1;
41318    };
41319
41320    SmallVector<int, 2> OpInputIdx;
41321    for (SDValue OpInput : OpInputs)
41322      OpInputIdx.push_back(
41323          AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41324
41325    assert(((RootMask.size() > OpMask.size() &&
41326             RootMask.size() % OpMask.size() == 0) ||
41327            (OpMask.size() > RootMask.size() &&
41328             OpMask.size() % RootMask.size() == 0) ||
41329            OpMask.size() == RootMask.size()) &&
41330           "The smaller number of elements must divide the larger.");
41331
41332    // This function can be performance-critical, so we rely on the power-of-2
41333    // knowledge that we have about the mask sizes to replace div/rem ops with
41334    // bit-masks and shifts.
41335    assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41336           "Non-power-of-2 shuffle mask sizes");
41337    assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41338           "Non-power-of-2 shuffle mask sizes");
41339    unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41340    unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41341
    // Work at the finer of the two mask granularities; exactly one of the
    // ratios is > 1 (the coarser mask gets scaled up by that ratio).
41342    unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41343    unsigned RootRatio =
41344        std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41345    unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41346    assert((RootRatio == 1 || OpRatio == 1) &&
41347           "Must not have a ratio for both incoming and op masks!");
41348
41349    assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41350    assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41351    assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41352    unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41353    unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41354
41355    Mask.resize(MaskWidth, SM_SentinelUndef);
41356
41357    // Merge this shuffle operation's mask into our accumulated mask. Note that
41358    // this shuffle's mask will be the first applied to the input, followed by
41359    // the root mask to get us all the way to the root value arrangement. The
41360    // reason for this order is that we are recursing up the operation chain.
41361    for (unsigned i = 0; i < MaskWidth; ++i) {
41362      unsigned RootIdx = i >> RootRatioLog2;
41363      if (RootMask[RootIdx] < 0) {
41364        // This is a zero or undef lane, we're done.
41365        Mask[i] = RootMask[RootIdx];
41366        continue;
41367      }
41368
41369      unsigned RootMaskedIdx =
41370          RootRatio == 1
41371              ? RootMask[RootIdx]
41372              : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41373
41374      // Just insert the scaled root mask value if it references an input other
41375      // than the SrcOp we're currently inserting.
41376      if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41377          (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41378        Mask[i] = RootMaskedIdx;
41379        continue;
41380      }
41381
41382      RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41383      unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41384      if (OpMask[OpIdx] < 0) {
41385        // The incoming lanes are zero or undef, it doesn't matter which ones we
41386        // are using.
41387        Mask[i] = OpMask[OpIdx];
41388        continue;
41389      }
41390
41391      // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41392      unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41393                                          : (OpMask[OpIdx] << OpRatioLog2) +
41394                                                (RootMaskedIdx & (OpRatio - 1));
41395
41396      OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41397      int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41398      assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41399      OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41400
41401      Mask[i] = OpMaskedIdx;
41402    }
41403  }
41404
41405  // Peek through any free bitcasts to insert_subvector vector widenings or
41406  // extract_subvector nodes back to root size.
41407  // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41408  for (auto [I, Op] : enumerate(Ops)) {
41409    SDValue BC = Op;
41410    while (1) {
41411      if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41412        BC = BC.getOperand(0);
41413        continue;
41414      }
41415      if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41416          BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41417        // Set out of bounds mask indices to undef.
41418        Op = BC = BC.getOperand(1);
41419        unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41420        int Lo = I * Mask.size();
41421        int Hi = (I + 1) * Mask.size();
41422        int NewHi = Lo + (Mask.size() / Scale);
41423        for (int &M : Mask) {
41424          if (Lo <= M && NewHi <= M && M < Hi)
41425            M = SM_SentinelUndef;
41426        }
41427        continue;
41428      }
41429      if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41430          (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41431          isNullConstant(BC.getOperand(1))) {
41432        Op = BC = BC.getOperand(0);
41433        continue;
41434      }
41435      break;
41436    }
41437  }
41438
41439  // Remove unused/repeated shuffle source ops.
41441
41442  // Handle the all undef/zero/ones cases early.
41443  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41444    return DAG.getUNDEF(RootVT);
41445  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41446    return getZeroVector(RootVT, Subtarget, DAG, DL);
41447  if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41449    return getOnesVector(RootVT, DAG, DL);
41450
41451  assert(!Ops.empty() && "Shuffle with no inputs detected");
41452
41453  // Update the list of shuffle nodes that have been combined so far.
41454  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41455  CombinedNodes.push_back(Op.getNode());
41456
41457  // See if we can recurse into each shuffle source op (if it's a target
41458  // shuffle). The source op should only be generally combined if it either has
41459  // a single use (i.e. current Op) or all its users have already been combined,
41460  // if not then we can still combine but should prevent generation of variable
41461  // shuffles to avoid constant pool bloat.
41462  // Don't recurse if we already have more source ops than we can combine in
41463  // the remaining recursion depth.
41464  if (Ops.size() < (MaxDepth - Depth)) {
41465    for (int i = 0, e = Ops.size(); i < e; ++i) {
41466      // For empty roots, we need to resolve zeroable elements before combining
41467      // them with other shuffles.
41468      SmallVector<int, 64> ResolvedMask = Mask;
41469      if (EmptyRoot)
41470        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41471      bool AllowCrossLaneVar = false;
41472      bool AllowPerLaneVar = false;
41473      if (Ops[i].getNode()->hasOneUse() ||
41474          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41475        AllowCrossLaneVar = AllowVariableCrossLaneMask;
41476        AllowPerLaneVar = AllowVariablePerLaneMask;
41477      }
41479          Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41480          MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41481          DAG, DL, Subtarget))
41482        return Res;
41483    }
41484  }
41485
41486  // Attempt to constant fold all of the constant source ops.
41488          RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41489    return Cst;
41490
41491  // If constant fold failed and we only have constants - then we have
41492  // multiple uses by a single non-variable shuffle - just bail.
41493  if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41494        APInt UndefElts;
41495        SmallVector<APInt> RawBits;
41496        unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41497        return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41498                                             RawBits,
41499                                             /*AllowWholeUndefs*/ true,
41500                                             /*AllowPartialUndefs*/ true);
41501      })) {
41502    return SDValue();
41503  }
41504
41505  // Canonicalize the combined shuffle mask chain with horizontal ops.
41506  // NOTE: This will update the Ops and Mask.
41508          Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41509    return DAG.getBitcast(RootVT, HOp);
41510
41511  // Try to refine our inputs given our knowledge of target shuffle mask.
41512  for (auto I : enumerate(Ops)) {
41513    int OpIdx = I.index();
41514    SDValue &Op = I.value();
41515
41516    // What range of shuffle mask element values results in picking from Op?
41517    int Lo = OpIdx * Mask.size();
41518    int Hi = Lo + Mask.size();
41519
41520    // Which elements of Op do we demand, given the mask's granularity?
41521    APInt OpDemandedElts(Mask.size(), 0);
41522    for (int MaskElt : Mask) {
41523      if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41524        int OpEltIdx = MaskElt - Lo;
41525        OpDemandedElts.setBit(OpEltIdx);
41526      }
41527    }
41528
41529    // Is the shuffle result smaller than the root?
41530    if (Op.getValueSizeInBits() < RootSizeInBits) {
41531      // We padded the mask with undefs. But we now need to undo that.
41532      unsigned NumExpectedVectorElts = Mask.size();
41533      unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41534      unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41535      assert(!OpDemandedElts.extractBits(
41536                 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41537             "Demanding the virtual undef widening padding?");
41538      OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41539    }
41540
41541    // The Op itself may be of different VT, so we need to scale the mask.
41542    unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41543    APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41544
41545    // Can this operand be simplified any further, given it's demanded elements?
41547            Op, OpScaledDemandedElts, DAG))
41548      Op = NewOp;
41549  }
41550  // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41551
41552  // Widen any subvector shuffle inputs we've collected.
41553  // TODO: Remove this to avoid generating temporary nodes, we should only
41554  // widen once combineX86ShuffleChain has found a match.
41555  if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41556        return Op.getValueSizeInBits() < RootSizeInBits;
41557      })) {
41558    for (SDValue &Op : Ops)
41559      if (Op.getValueSizeInBits() < RootSizeInBits)
41560        Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41561                            RootSizeInBits);
41562    // Reresolve - we might have repeated subvector sources.
41564  }
41565
41566  // We can only combine unary and binary shuffle mask cases.
41567  if (Ops.size() <= 2) {
41568    // Minor canonicalization of the accumulated shuffle mask to make it easier
41569    // to match below. All this does is detect masks with sequential pairs of
41570    // elements, and shrink them to the half-width mask. It does this in a loop
41571    // so it will reduce the size of the mask to the minimal width mask which
41572    // performs an equivalent shuffle.
41573    while (Mask.size() > 1) {
41574      SmallVector<int, 64> WidenedMask;
41575      if (!canWidenShuffleElements(Mask, WidenedMask))
41576        break;
41577      Mask = std::move(WidenedMask);
41578    }
41579
41580    // Canonicalization of binary shuffle masks to improve pattern matching by
41581    // commuting the inputs.
41582    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41584      std::swap(Ops[0], Ops[1]);
41585    }
41586
41587    // Try to combine into a single shuffle instruction.
41588    if (SDValue Shuffle = combineX86ShuffleChain(
41589            Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41590            AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41591            IsMaskedShuffle, DAG, DL, Subtarget))
41592      return Shuffle;
41593
41594    // If all the operands come from the same larger vector, fallthrough and try
41595    // to use combineX86ShuffleChainWithExtract.
41598    if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41599        (RootSizeInBits / Mask.size()) != 64 ||
41600        LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41601        RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41602        LHS.getOperand(0) != RHS.getOperand(0))
41603      return SDValue();
41604  }
41605
41606  // If that failed and any input is extracted then try to combine as a
41607  // shuffle with the larger type.
41609      Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41610      AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41611      DAG, DL, Subtarget);
41612}
41613
41614/// Helper entry wrapper to combineX86ShufflesRecursively.
41616 const X86Subtarget &Subtarget) {
41618 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41619 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41620 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41621 SDLoc(Op), Subtarget);
41622}
41623
41624/// Get the PSHUF-style mask from PSHUF node.
41625///
41626/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
41627/// PSHUF-style masks that can be reused with such instructions.
41629 MVT VT = N.getSimpleValueType();
41632 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41633 (void)HaveMask;
41634 assert(HaveMask);
41635
41636 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41637 // matter. Check that the upper masks are repeats and remove them.
41638 if (VT.getSizeInBits() > 128) {
41639 int LaneElts = 128 / VT.getScalarSizeInBits();
41640#ifndef NDEBUG
41641 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41642 for (int j = 0; j < LaneElts; ++j)
41643 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41644 "Mask doesn't repeat in high 128-bit lanes!");
41645#endif
41646 Mask.resize(LaneElts);
41647 }
41648
41649 switch (N.getOpcode()) {
41650 case X86ISD::PSHUFD:
41651 return Mask;
41652 case X86ISD::PSHUFLW:
41653 Mask.resize(4);
41654 return Mask;
41655 case X86ISD::PSHUFHW:
41656 Mask.erase(Mask.begin(), Mask.begin() + 4);
41657 for (int &M : Mask)
41658 M -= 4;
41659 return Mask;
41660 default:
41661 llvm_unreachable("No valid shuffle instruction found!");
41662 }
41663}
41664
41665/// Get the expanded blend mask from a BLENDI node.
41666/// For v16i16 nodes, this will splat the repeated i8 mask.
41668 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41669 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41670 APInt Mask = V.getConstantOperandAPInt(2);
41671 if (Mask.getBitWidth() > NumElts)
41672 Mask = Mask.trunc(NumElts);
41673 if (NumElts == 16) {
41674 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41675 Mask = APInt::getSplat(16, Mask);
41676 }
41677 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41678 return Mask;
41679}
41680
41681/// Search for a combinable shuffle across a chain ending in pshufd.
41682///
41683/// We walk up the chain and look for a combinable shuffle, skipping over
41684/// shuffles that we could hoist this shuffle's transformation past without
41685/// altering anything.
// NOTE(review): doxygen extraction dropped lines 41686-41687 (the opening of
// the signature) and 41696 (the declaration of the 'Chain' stack used below).
// The visible code implies (SDValue N, <mask>, const SDLoc &DL,
// SelectionDAG &DAG) and a SmallVector of SDValue - confirm against upstream.
41688 const SDLoc &DL,
41689 SelectionDAG &DAG) {
41690 assert(N.getOpcode() == X86ISD::PSHUFD &&
41691 "Called with something other than an x86 128-bit half shuffle!");
41692
41693 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41694 // of the shuffles in the chain so that we can form a fresh chain to replace
41695 // this one.
41697 SDValue V = N.getOperand(0);
41698 for (; V.hasOneUse(); V = V.getOperand(0)) {
41699 switch (V.getOpcode()) {
41700 default:
41701 return SDValue(); // Nothing combined!
41702
41703 case ISD::BITCAST:
41704 // Skip bitcasts as we always know the type for the target specific
41705 // instructions.
41706 continue;
41707
41708 case X86ISD::PSHUFD:
41709 // Found another dword shuffle.
41710 break;
41711
41712 case X86ISD::PSHUFLW:
41713 // Check that the low words (being shuffled) are the identity in the
41714 // dword shuffle, and the high words are self-contained.
41715 if (Mask[0] != 0 || Mask[1] != 1 ||
41716 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41717 return SDValue();
41718
// Safe to hoist past this PSHUFLW - remember it so the chain can be rebuilt.
41719 Chain.push_back(V);
41720 continue;
41721
41722 case X86ISD::PSHUFHW:
41723 // Check that the high words (being shuffled) are the identity in the
41724 // dword shuffle, and the low words are self-contained.
41725 if (Mask[2] != 2 || Mask[3] != 3 ||
41726 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41727 return SDValue();
41728
41729 Chain.push_back(V);
41730 continue;
41731
41732 case X86ISD::UNPCKL:
41733 case X86ISD::UNPCKH:
41734 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41735 // shuffle into a preceding word shuffle.
41736 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41737 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41738 return SDValue();
41739
41740 // Search for a half-shuffle which we can combine with.
41741 unsigned CombineOp =
41742 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
// The unpack must be a self-unpack (both operands identical) and we must be
// the only user of that operand, or the rewrite would affect other users.
41743 if (V.getOperand(0) != V.getOperand(1) ||
41744 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41745 return SDValue();
41746 Chain.push_back(V);
41747 V = V.getOperand(0);
41748 do {
41749 switch (V.getOpcode()) {
41750 default:
41751 return SDValue(); // Nothing to combine.
41752
41753 case X86ISD::PSHUFLW:
41754 case X86ISD::PSHUFHW:
41755 if (V.getOpcode() == CombineOp)
41756 break;
41757
41758 Chain.push_back(V);
41759
41760 [[fallthrough]];
41761 case ISD::BITCAST:
41762 V = V.getOperand(0);
41763 continue;
41764 }
41765 break;
41766 } while (V.hasOneUse());
41767 break;
41768 }
41769 // Break out of the loop if we break out of the switch.
41770 break;
41771 }
41772
41773 if (!V.hasOneUse())
41774 // We fell out of the loop without finding a viable combining instruction.
41775 return SDValue();
41776
41777 // Merge this node's mask and our incoming mask.
// NOTE(review): extraction dropped line 41778 here, which defines VMask
// (presumably the PSHUF-style mask of V, via getPSHUFShuffleMask(V)) -
// confirm against upstream.
41779 for (int &M : Mask)
41780 M = VMask[M];
// Re-emit the found shuffle with the merged mask.
41781 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41782 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41783
41784 // Rebuild the chain around this new shuffle.
41785 while (!Chain.empty()) {
41786 SDValue W = Chain.pop_back_val();
41787
41788 if (V.getValueType() != W.getOperand(0).getValueType())
41789 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41790
41791 switch (W.getOpcode()) {
41792 default:
41793 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41794
41795 case X86ISD::UNPCKL:
41796 case X86ISD::UNPCKH:
// Self-unpacks were required above, so rebuild with V as both operands.
41797 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41798 break;
41799
41800 case X86ISD::PSHUFD:
41801 case X86ISD::PSHUFLW:
41802 case X86ISD::PSHUFHW:
41803 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41804 break;
41805 }
41806 }
41807 if (V.getValueType() != N.getValueType())
41808 V = DAG.getBitcast(N.getValueType(), V);
41809
41810 // Return the new chain to replace N.
41811 return V;
41812}
41813
41814// Attempt to commute shufps LHS loads:
41815// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
// NOTE(review): extraction dropped line 41816 (the start of the signature);
// the visible tail implies (SDValue N, MVT VT, const SDLoc &DL,
// SelectionDAG &DAG) - confirm against upstream.
41817 SelectionDAG &DAG) {
41818 // TODO: Add vXf64 support.
41819 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41820 return SDValue();
41821
41822 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41823 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41824 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41825 return SDValue();
41826 SDValue N0 = V.getOperand(0);
41827 SDValue N1 = V.getOperand(1);
41828 unsigned Imm = V.getConstantOperandVal(2);
41829 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
// NOTE(review): extraction dropped line 41831, the second half of this
// condition (presumably checking that N1 is NOT a foldable load) - confirm.
41830 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41832 return SDValue();
// Swapping the operands requires swapping the two nibbles of the immediate.
41833 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41834 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41835 DAG.getTargetConstant(Imm, DL, MVT::i8));
41836 };
41837
41838 switch (N.getOpcode()) {
41839 case X86ISD::VPERMILPI:
41840 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41841 unsigned Imm = N.getConstantOperandVal(1);
// XOR with 0xAA flips the element-pair selection to compensate for the
// commuted SHUFP operands.
41842 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41843 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41844 }
41845 break;
41846 case X86ISD::SHUFP: {
41847 SDValue N0 = N.getOperand(0);
41848 SDValue N1 = N.getOperand(1);
41849 unsigned Imm = N.getConstantOperandVal(2);
41850 if (N0 == N1) {
41851 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41852 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41853 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41854 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
// Only the low nibble selects from the commuted operand - adjust just it.
41855 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41856 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41857 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41858 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41859 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41860 }
41861 break;
41862 }
41863 }
41864
41865 return SDValue();
41866}
41867
41868// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41869// iff we don't demand the same element index for both X and Y.
// NOTE(review): extraction dropped line 41871 (the opening of the parameter
// list, presumably (MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
// ...) and lines 41879-41880, which presumably peek through bitcasts of
// N0/N1 to form BC0/BC1 used below - confirm against upstream.
41870static SDValue
41872 const APInt &DemandedElts, SelectionDAG &DAG,
41873 const X86Subtarget &Subtarget, const SDLoc &DL) {
41874 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41875 if (!N0.hasOneUse() || !N1.hasOneUse())
41876 return SDValue();
41877
41878 unsigned NumElts = VT.getVectorNumElements();
41881
41882 // See if both operands are shuffles, and that we can scale the shuffle masks
41883 // to the same width as the blend mask.
41884 // TODO: Support SM_SentinelZero?
41885 SmallVector<SDValue, 2> Ops0, Ops1;
41886 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41887 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41888 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41889 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41890 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41891 return SDValue();
41892
41893 // Determine the demanded elts from both permutes.
41894 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41895 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41896 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41897 Demanded1,
41898 /*AllowUndefElts=*/true) ||
41899 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41900 DemandedRHS0, /*AllowUndefElts=*/true) ||
41901 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41902 DemandedRHS1, /*AllowUndefElts=*/true))
41903 return SDValue();
41904
41905 // Confirm that we only use a single operand from both permutes and that we
41906 // don't demand the same index from both.
41907 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41908 DemandedLHS0.intersects(DemandedLHS1))
41909 return SDValue();
41910
41911 // Use the permute demanded elts masks as the new blend mask.
41912 // Create the new permute mask as a blend of the 2 original permute masks.
41913 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41914 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41915 for (unsigned I = 0; I != NumElts; ++I) {
41916 if (Demanded0[I]) {
41917 int M = ScaledMask0[I];
41918 if (0 <= M) {
41919 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41920 "BlendMask demands LHS AND RHS");
41921 NewBlendMask[M] = M;
41922 NewPermuteMask[I] = M;
41923 }
41924 } else if (Demanded1[I]) {
41925 int M = ScaledMask1[I];
41926 if (0 <= M) {
41927 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41928 "BlendMask demands LHS AND RHS");
// Indices >= NumElts select from the RHS (second blend input).
41929 NewBlendMask[M] = M + NumElts;
41930 NewPermuteMask[I] = M;
41931 }
41932 }
41933 }
41934 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41935 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41936
41937 // v16i16 shuffles can explode in complexity very easily, only accept them if
41938 // the blend mask is the same in the 128-bit subvectors (or can widen to
41939 // v8i32) and the permute can be widened as well.
41940 if (VT == MVT::v16i16) {
41941 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41942 !canWidenShuffleElements(NewBlendMask))
41943 return SDValue();
41944 if (!canWidenShuffleElements(NewPermuteMask))
41945 return SDValue();
41946 }
41947
41948 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41949 // widened to a lane permute (vperm2f128).
// NOTE(review): extraction dropped line 41951, the middle of this condition
// (presumably a !is128BitLaneCrossingShuffleMask(... check) - confirm.
41950 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41952 NewPermuteMask) &&
41953 !canScaleShuffleElements(NewPermuteMask, 2))
41954 return SDValue();
41955
// Emit BLEND first, then a unary PERMUTE of the blended result.
41956 SDValue NewBlend =
41957 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41958 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41959 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41960 NewPermuteMask);
41961}
41962
41963// TODO - move this to TLI like isBinOp?
41964static bool isUnaryOp(unsigned Opcode) {
41965 switch (Opcode) {
41966 case ISD::CTLZ:
41967 case ISD::CTTZ:
41968 case ISD::CTPOP:
41969 return true;
41970 }
41971 return false;
41972}
41973
41974// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41975// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
// NOTE(review): extraction dropped line 41976 (the opening of the signature);
// the visible tail implies (SDValue N, SelectionDAG &DAG, const SDLoc &DL) -
// confirm against upstream.
41977 const SDLoc &DL) {
41978 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41979 EVT ShuffleVT = N.getValueType();
41980 unsigned Opc = N.getOpcode();
41981
// Predicate: is Op cheap to shuffle (constants, splats, other shuffles with
// one use, etc.), so pushing this shuffle into it won't add net shuffles?
41982 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
41983 // AllZeros/AllOnes constants are freely shuffled and will peek through
41984 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41985 // merge with target shuffles if it has one use so shuffle combining is
41986 // likely to kick in. Shuffles of splats are expected to be removed.
41987 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41988 ISD::isBuildVectorAllZeros(Op.getNode()) ||
// NOTE(review): extraction dropped lines 41989-41990 (additional constant
// build-vector checks in this disjunction) - confirm against upstream.
41991 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
41992 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
41993 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41994 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
41995 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41996 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41997 };
41998 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41999 // Ensure we only shuffle whole vector src elements, unless its a logical
42000 // binops where we can more aggressively move shuffles from dst to src.
42001 return isLogicOp(BinOp) ||
42002 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42003 };
42004
42005 switch (Opc) {
42006 // Unary and Unary+Permute Shuffles.
42007 case X86ISD::PSHUFB: {
42008 // Don't merge PSHUFB if it contains zero'd elements.
42009 SmallVector<int> Mask;
// NOTE(review): extraction dropped line 42010 (the Ops declaration used by
// getTargetShuffleMask below) - confirm against upstream.
42011 if (!getTargetShuffleMask(N, false, Ops, Mask))
42012 break;
42013 [[fallthrough]];
42014 }
42015 case X86ISD::VBROADCAST:
42016 case X86ISD::MOVDDUP:
42017 case X86ISD::PSHUFD:
42018 case X86ISD::PSHUFHW:
42019 case X86ISD::PSHUFLW:
42020 case X86ISD::VPERMV:
42021 case X86ISD::VPERMI:
42022 case X86ISD::VPERMILPI: {
// VPERMV takes its mask as operand 0 and the source as operand 1; all other
// unary shuffles here take the source as operand 0.
42023 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42024 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42025 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42026 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42027 unsigned SrcOpcode = N0.getOpcode();
42028 EVT OpVT = N0.getValueType();
42029 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
// NOTE(review): extraction dropped lines 42030-42031, which presumably
// define Op00/Op01 as the (bitcast-peeked) operands of N0 - confirm.
42032 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42033 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42034 IsMergeableWithShuffle(Op01, FoldShuf)) {
42035 SDValue LHS, RHS;
42036 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42037 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42038 if (Opc == X86ISD::VPERMV) {
42039 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42040 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42041 } else if (N.getNumOperands() == 2) {
42042 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42043 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42044 } else {
42045 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42046 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42047 }
42048 return DAG.getBitcast(ShuffleVT,
42049 DAG.getNode(SrcOpcode, DL, OpVT,
42050 DAG.getBitcast(OpVT, LHS),
42051 DAG.getBitcast(OpVT, RHS)));
42052 }
42053 }
// SHUFFLE(SINT_TO_FP(X)) -> SINT_TO_FP(SHUFFLE(X)) when element sizes match.
42054 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42055 OpVT.getScalarSizeInBits() ==
42057 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42058 if (Opc == X86ISD::VPERMV)
42059 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42060 else if (N.getNumOperands() == 2)
42061 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42062 else
42063 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42064 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42065 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42066 }
42067 }
42068 break;
42069 }
42070 // Binary and Binary+Permute Shuffles.
42071 case X86ISD::INSERTPS: {
42072 // Don't merge INSERTPS if it contains zero'd elements.
42073 unsigned InsertPSMask = N.getConstantOperandVal(2);
42074 unsigned ZeroMask = InsertPSMask & 0xF;
42075 if (ZeroMask != 0)
42076 break;
42077 [[fallthrough]];
42078 }
42079 case X86ISD::MOVSD:
42080 case X86ISD::MOVSS:
42081 case X86ISD::BLENDI:
42082 case X86ISD::SHUFP:
42083 case X86ISD::UNPCKH:
42084 case X86ISD::UNPCKL: {
42085 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42086 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42087 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42088 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42089 unsigned SrcOpcode = N0.getOpcode();
// Both shuffle inputs must be the same binop on matching types.
42090 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42091 N0.getValueType() == N1.getValueType() &&
42092 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42093 IsSafeToMoveShuffle(N1, SrcOpcode)) {
// NOTE(review): extraction dropped lines 42094-42097, which presumably
// define Op00/Op01 (operands of N0) and Op10/Op11 (operands of N1) - confirm.
42098 // Ensure the total number of shuffles doesn't increase by folding this
42099 // shuffle through to the source ops.
42100 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42101 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42102 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42103 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42104 SDValue LHS, RHS;
42105 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42106 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42107 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42108 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42109 if (N.getNumOperands() == 3) {
42110 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42111 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42112 } else {
42113 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42114 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42115 }
42116 EVT OpVT = N0.getValueType();
42117 return DAG.getBitcast(ShuffleVT,
42118 DAG.getNode(SrcOpcode, DL, OpVT,
42119 DAG.getBitcast(OpVT, LHS),
42120 DAG.getBitcast(OpVT, RHS)));
42121 }
42122 }
// SHUFFLE(UNARYOP(X),UNARYOP(Y)) -> UNARYOP(SHUFFLE(X,Y)).
42123 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42124 N0.getValueType() == N1.getValueType() &&
42125 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42126 IsSafeToMoveShuffle(N1, SrcOpcode)) {
// NOTE(review): extraction dropped lines 42127-42128, presumably defining
// Op00/Op10 as the single operands of N0/N1 - confirm against upstream.
42129 SDValue Res;
42130 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42131 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42132 if (N.getNumOperands() == 3) {
42133 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42134 } else {
42135 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42136 }
42137 EVT OpVT = N0.getValueType();
42138 return DAG.getBitcast(
42139 ShuffleVT,
42140 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42141 }
42142 // TODO: We can generalize this for other shuffles/conversions.
42143 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42144 N1.getOpcode() == SrcOpcode &&
42145 N0.getValueType() == N1.getValueType() &&
42146 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42147 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42148 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42149 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42150 EVT OpSrcVT = N0.getOperand(0).getValueType();
42151 EVT OpDstVT = N0.getValueType();
42152 SDValue Res =
42153 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42154 return DAG.getBitcast(ShuffleVT,
42155 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42156 }
42157 }
42158 break;
42159 }
42160 }
42161 return SDValue();
42162}
42163
42164/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
// NOTE(review): extraction dropped line 42165 (the opening of the signature);
// the visible tail implies (SDValue V, SelectionDAG &DAG, const SDLoc &DL) -
// confirm against upstream.
42166 SelectionDAG &DAG,
42167 const SDLoc &DL) {
42168 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42169
42170 MVT VT = V.getSimpleValueType();
42171 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42172 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42173 unsigned SrcOpc0 = Src0.getOpcode();
42174 unsigned SrcOpc1 = Src1.getOpcode();
42175 EVT SrcVT0 = Src0.getValueType();
42176 EVT SrcVT1 = Src1.getValueType();
42177
// Both sources must be the same op on the same type (unless Src1 is undef),
// otherwise the op can't be hoisted above the lane shuffle.
42178 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42179 return SDValue();
42180
42181 switch (SrcOpc0) {
42182 case X86ISD::MOVDDUP: {
42183 SDValue LHS = Src0.getOperand(0);
42184 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42185 SDValue Res =
42186 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42187 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42188 return DAG.getBitcast(VT, Res);
42189 }
42190 case X86ISD::VPERMILPI:
42191 // TODO: Handle v4f64 permutes with different low/high lane masks.
// For v4f64 the 4-bit immediate holds a 2-bit mask per 128-bit lane; only
// fold when both lanes use the same mask, so lane swapping is transparent.
42192 if (SrcVT0 == MVT::v4f64) {
42193 uint64_t Mask = Src0.getConstantOperandVal(1);
42194 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42195 break;
42196 }
42197 [[fallthrough]];
42198 case X86ISD::VSHLI:
42199 case X86ISD::VSRLI:
42200 case X86ISD::VSRAI:
42201 case X86ISD::PSHUFD:
// These take an extra immediate operand - both sources must share it.
42202 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42203 SDValue LHS = Src0.getOperand(0);
42204 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42205 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42206 V.getOperand(2));
42207 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42208 return DAG.getBitcast(VT, Res);
42209 }
42210 break;
42211 }
42212
42213 return SDValue();
42214}
42215
42216/// Try to combine x86 target specific shuffles.
42218 SelectionDAG &DAG,
42220 const X86Subtarget &Subtarget) {
42221 using namespace SDPatternMatch;
42222
42223 MVT VT = N.getSimpleValueType();
42224 unsigned NumElts = VT.getVectorNumElements();
42226 unsigned Opcode = N.getOpcode();
42227 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42228
42229 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42230 return R;
42231
42232 // Handle specific target shuffles.
42233 switch (Opcode) {
42234 case X86ISD::MOVDDUP: {
42235 SDValue Src = N.getOperand(0);
42236 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42237 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42238 ISD::isNormalLoad(Src.getNode())) {
42239 LoadSDNode *LN = cast<LoadSDNode>(Src);
42240 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42241 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42242 DCI.CombineTo(N.getNode(), Movddup);
42243 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42245 return N; // Return N so it doesn't get rechecked!
42246 }
42247 }
42248
42249 return SDValue();
42250 }
42251 case X86ISD::VBROADCAST: {
42252 SDValue Src = N.getOperand(0);
42253 SDValue BC = peekThroughBitcasts(Src);
42254 EVT SrcVT = Src.getValueType();
42255 EVT BCVT = BC.getValueType();
42256
42257 // If broadcasting from another shuffle, attempt to simplify it.
42258 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42259 if (isTargetShuffle(BC.getOpcode()) &&
42260 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42261 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42262 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42264 for (unsigned i = 0; i != Scale; ++i)
42265 DemandedMask[i] = i;
42267 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42268 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42269 /*AllowVariableCrossLaneMask=*/true,
42270 /*AllowVariablePerLaneMask=*/true,
42271 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42272 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42273 DAG.getBitcast(SrcVT, Res));
42274 }
42275
42276 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42277 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42278 if (Src.getOpcode() == ISD::BITCAST &&
42279 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42280 TLI.isTypeLegal(BCVT) &&
42282 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42283 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42285 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42286 }
42287
42288 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42289 // If we're re-broadcasting a smaller type then broadcast with that type and
42290 // bitcast.
42291 // TODO: Do this for any splat?
42292 if (Src.getOpcode() == ISD::BITCAST &&
42293 (BC.getOpcode() == X86ISD::VBROADCAST ||
42295 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42296 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42297 MVT NewVT =
42299 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42300 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42301 }
42302
42303 // Reduce broadcast source vector to lowest 128-bits.
42304 if (SrcVT.getSizeInBits() > 128)
42305 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42306 extract128BitVector(Src, 0, DAG, DL));
42307
42308 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42309 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42310 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42311 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42312
42313 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42314 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42315 isNullConstant(Src.getOperand(1)) &&
42316 Src.getValueType() ==
42317 Src.getOperand(0).getValueType().getScalarType() &&
42318 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42319 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42320
42321 // Share broadcast with the longest vector and extract low subvector (free).
42322 // Ensure the same SDValue from the SDNode use is being used.
42323 for (SDNode *User : Src->users())
42324 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42325 Src == User->getOperand(0) &&
42326 User->getValueSizeInBits(0).getFixedValue() >
42327 VT.getFixedSizeInBits()) {
42328 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42329 VT.getSizeInBits());
42330 }
42331
42332 // vbroadcast(scalarload X) -> vbroadcast_load X
42333 // For float loads, extract other uses of the scalar from the broadcast.
42334 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42335 ISD::isNormalLoad(Src.getNode())) {
42336 LoadSDNode *LN = cast<LoadSDNode>(Src);
42337 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42338 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42339 SDValue BcastLd =
42341 LN->getMemoryVT(), LN->getMemOperand());
42342 // If the load value is used only by N, replace it via CombineTo N.
42343 bool NoReplaceExtract = Src.hasOneUse();
42344 DCI.CombineTo(N.getNode(), BcastLd);
42345 if (NoReplaceExtract) {
42346 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42348 } else {
42349 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42350 DAG.getVectorIdxConstant(0, DL));
42351 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42352 }
42353 return N; // Return N so it doesn't get rechecked!
42354 }
42355
42356 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42357 // i16. So shrink it ourselves if we can make a broadcast_load.
42358 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42359 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42360 assert(Subtarget.hasAVX2() && "Expected AVX2");
42361 SDValue TruncIn = Src.getOperand(0);
42362
42363 // If this is a truncate of a non extending load we can just narrow it to
42364 // use a broadcast_load.
42365 if (ISD::isNormalLoad(TruncIn.getNode())) {
42366 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42367 // Unless its volatile or atomic.
42368 if (LN->isSimple()) {
42369 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42370 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42371 SDValue BcastLd = DAG.getMemIntrinsicNode(
42372 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42373 LN->getPointerInfo(), LN->getBaseAlign(),
42374 LN->getMemOperand()->getFlags());
42375 DCI.CombineTo(N.getNode(), BcastLd);
42376 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42377 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42378 return N; // Return N so it doesn't get rechecked!
42379 }
42380 }
42381
42382 // If this is a truncate of an i16 extload, we can directly replace it.
42383 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42384 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42385 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42386 if (LN->getMemoryVT().getSizeInBits() == 16) {
42387 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42388 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42389 SDValue BcastLd =
42391 LN->getMemoryVT(), LN->getMemOperand());
42392 DCI.CombineTo(N.getNode(), BcastLd);
42393 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42394 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42395 return N; // Return N so it doesn't get rechecked!
42396 }
42397 }
42398
42399 // If this is a truncate of load that has been shifted right, we can
42400 // offset the pointer and use a narrower load.
42401 if (TruncIn.getOpcode() == ISD::SRL &&
42402 TruncIn.getOperand(0).hasOneUse() &&
42403 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42404 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42405 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42406 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42407 // Make sure the shift amount and the load size are divisible by 16.
42408 // Don't do this if the load is volatile or atomic.
42409 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42410 LN->isSimple()) {
42411 unsigned Offset = ShiftAmt / 8;
42412 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42415 SDValue Ops[] = { LN->getChain(), Ptr };
42416 SDValue BcastLd = DAG.getMemIntrinsicNode(
42417 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42419 LN->getMemOperand()->getFlags());
42420 DCI.CombineTo(N.getNode(), BcastLd);
42421 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42422 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42423 return N; // Return N so it doesn't get rechecked!
42424 }
42425 }
42426 }
42427
42428 // vbroadcast(vzload X) -> vbroadcast_load X
42429 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42430 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
42431 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42432 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42433 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42434 SDValue BcastLd =
42436 LN->getMemoryVT(), LN->getMemOperand());
42437 DCI.CombineTo(N.getNode(), BcastLd);
42438 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42440 return N; // Return N so it doesn't get rechecked!
42441 }
42442 }
42443
42444 // vbroadcast(vector load X) -> vbroadcast_load
42445 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42446 LoadSDNode *LN = cast<LoadSDNode>(Src);
42447 // Unless the load is volatile or atomic.
42448 if (LN->isSimple()) {
42449 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42450 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42451 SDValue BcastLd = DAG.getMemIntrinsicNode(
42452 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42453 LN->getPointerInfo(), LN->getBaseAlign(),
42454 LN->getMemOperand()->getFlags());
42455 DCI.CombineTo(N.getNode(), BcastLd);
42456 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42458 return N; // Return N so it doesn't get rechecked!
42459 }
42460 }
42461
42462 return SDValue();
42463 }
42464 case X86ISD::VZEXT_MOVL: {
42465 SDValue N0 = N.getOperand(0);
42466
42467 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42468 // Zeroing out the upper elements means we're just shifting a zero value.
42469 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42470 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42471 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42472 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42473 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42474 if (N0.hasOneUse())
42475 return DAG.getNode(
42476 N0.getOpcode(), DL, VT,
42477 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42478 N0.getOperand(1));
42479 }
42480
42481 // If this a vzmovl of a full vector load, replace it with a vzload, unless
42482 // the load is volatile.
42483 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42484 auto *LN = cast<LoadSDNode>(N0);
42485 if (SDValue VZLoad =
42486 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42487 DCI.CombineTo(N.getNode(), VZLoad);
42488 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42490 return N;
42491 }
42492 }
42493
42494 // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42495 // and can just use a VZEXT_LOAD.
42496 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42497 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42498 auto *LN = cast<MemSDNode>(N0);
42499 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42500 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42501 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42502 SDValue VZLoad =
42504 LN->getMemoryVT(), LN->getMemOperand());
42505 DCI.CombineTo(N.getNode(), VZLoad);
42506 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42508 return N;
42509 }
42510 }
42511
42512 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42513 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42514 // if the upper bits of the i64 are zero.
42515 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42516 N0.getOperand(0).hasOneUse() &&
42517 N0.getOperand(0).getValueType() == MVT::i64) {
42518 SDValue In = N0.getOperand(0);
42519 APInt Mask = APInt::getHighBitsSet(64, 32);
42520 if (DAG.MaskedValueIsZero(In, Mask)) {
42521 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42522 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42523 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42524 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42525 return DAG.getBitcast(VT, Movl);
42526 }
42527 }
42528
42529 // Load a scalar integer constant directly to XMM instead of transferring an
42530 // immediate value from GPR.
42531 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42532 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42533 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42534 // Create a vector constant - scalar constant followed by zeros.
42535 EVT ScalarVT = N0.getOperand(0).getValueType();
42536 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42537 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42538 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42539 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42540
42541 // Load the vector constant from constant pool.
42542 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42543 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42544 MachinePointerInfo MPI =
42546 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42547 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42549 }
42550 }
42551
42552 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42553 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42554 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42555 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42556 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42558
42559 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42560 isNullConstant(V.getOperand(2))) {
42561 SDValue In = V.getOperand(1);
42563 In.getValueSizeInBits() /
42564 VT.getScalarSizeInBits());
42565 In = DAG.getBitcast(SubVT, In);
42566 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42567 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42568 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42569 V.getOperand(2));
42570 }
42571 }
42572
42573 return SDValue();
42574 }
42575 case X86ISD::BLENDI: {
42576 SDValue N0 = N.getOperand(0);
42577 SDValue N1 = N.getOperand(1);
42578 unsigned EltBits = VT.getScalarSizeInBits();
42579
42580 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42581 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42582 // TODO: Handle MVT::v16i16 repeated blend mask.
42583 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42584 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42585 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42586 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42587 unsigned NewSize = SrcVT.getVectorNumElements();
42588 APInt BlendMask = getBLENDIBlendMask(N);
42589 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42590 return DAG.getBitcast(
42591 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42592 N1.getOperand(0),
42593 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42594 DL, MVT::i8)));
42595 }
42596 }
42597 // Share PSHUFB masks:
42598 // blend(pshufb(x,m1),pshufb(y,m2))
42599 // --> m3 = blend(m1,m2)
42600 // blend(pshufb(x,m3),pshufb(y,m3))
42601 if (N0.hasOneUse() && N1.hasOneUse()) {
42602 SmallVector<int> Mask, ByteMask;
42606 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42607 RHS.getOpcode() == X86ISD::PSHUFB &&
42608 LHS.getOperand(1) != RHS.getOperand(1) &&
42609 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42610 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42611 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42612 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42613 "BLENDI decode mismatch");
42614 MVT ShufVT = LHS.getSimpleValueType();
42615 SDValue MaskLHS = LHS.getOperand(1);
42616 SDValue MaskRHS = RHS.getOperand(1);
42617 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42619 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42620 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42621 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42622 LHS.getOperand(0), NewMask);
42623 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42624 RHS.getOperand(0), NewMask);
42625 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42626 DAG.getBitcast(VT, NewLHS),
42627 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42628 }
42629 }
42630 }
42631 }
42632 return SDValue();
42633 }
42634 case X86ISD::SHUFP: {
42635 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42636 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42637 // TODO: Support types other than v4f32.
42638 if (VT == MVT::v4f32) {
42639 bool Updated = false;
42640 SmallVector<int> Mask;
42642 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42643 for (int i = 0; i != 2; ++i) {
42644 SmallVector<SDValue> SubOps;
42645 SmallVector<int> SubMask, SubScaledMask;
42647 // TODO: Scaling might be easier if we specify the demanded elts.
42648 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42649 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42650 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42651 int Ofs = i * 2;
42652 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42653 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42654 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42655 Updated = true;
42656 }
42657 }
42658 }
42659 if (Updated) {
42660 for (int &M : Mask)
42661 M %= 4;
42662 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42663 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42664 }
42665 }
42666 return SDValue();
42667 }
42668 case X86ISD::VPERMI: {
42669 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42670 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42671 SDValue N0 = N.getOperand(0);
42672 SDValue N1 = N.getOperand(1);
42673 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42674 if (N0.getOpcode() == ISD::BITCAST &&
42675 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42676 SDValue Src = N0.getOperand(0);
42677 EVT SrcVT = Src.getValueType();
42678 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42679 return DAG.getBitcast(VT, Res);
42680 }
42681 return SDValue();
42682 }
42683 case X86ISD::SHUF128: {
42684 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42685 // see if we can peek through and access the subvector directly.
42686 if (VT.is512BitVector()) {
42687 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42688 // the upper subvector is used.
42689 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42690 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42691 uint64_t Mask = N->getConstantOperandVal(2);
42692 SmallVector<SDValue> LHSOps, RHSOps;
42693 SDValue NewLHS, NewRHS;
42694 if ((Mask & 0x0A) == 0x0A &&
42695 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42696 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42697 Mask &= ~0x0A;
42698 }
42699 if ((Mask & 0xA0) == 0xA0 &&
42700 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42701 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42702 Mask &= ~0xA0;
42703 }
42704 if (NewLHS || NewRHS)
42705 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42706 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42707 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42708 DAG.getTargetConstant(Mask, DL, MVT::i8));
42709 }
42710 return SDValue();
42711 }
42712 case X86ISD::VPERM2X128: {
42713 SDValue LHS = N->getOperand(0);
42714 SDValue RHS = N->getOperand(1);
42715 unsigned Imm = N.getConstantOperandVal(2) & 255;
42716
42717 // Canonicalize unary/repeated operands to LHS.
42718 if (LHS.isUndef() && !RHS.isUndef())
42719 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42720 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42721 if (LHS == RHS)
42722 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42723 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42724
42725 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42726 if (LHS.getOpcode() == ISD::BITCAST &&
42727 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42728 EVT SrcVT = LHS.getOperand(0).getValueType();
42729 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42730 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42731 DAG.getBitcast(SrcVT, LHS),
42732 DAG.getBitcast(SrcVT, RHS),
42733 N->getOperand(2)));
42734 }
42735 }
42736
42737 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42739 return Res;
42740
42741 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42742 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42743 auto FindSubVector128 = [&](unsigned Idx) {
42744 if (Idx > 3)
42745 return SDValue();
42746 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42747 SmallVector<SDValue> SubOps;
42748 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42749 return SubOps[Idx & 1];
42750 unsigned NumElts = Src.getValueType().getVectorNumElements();
42751 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42752 Src.getOperand(1).getValueSizeInBits() == 128 &&
42753 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42754 return Src.getOperand(1);
42755 }
42756 return SDValue();
42757 };
42758 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42759 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42760 MVT SubVT = VT.getHalfNumVectorElementsVT();
42761 SubLo = DAG.getBitcast(SubVT, SubLo);
42762 SubHi = DAG.getBitcast(SubVT, SubHi);
42763 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42764 }
42765 }
42766
42767 // Attempt to match VBROADCAST*128 subvector broadcast load.
42768 if (RHS.isUndef()) {
42770 DecodeVPERM2X128Mask(4, Imm, Mask);
42771 if (isUndefOrInRange(Mask, 0, 4)) {
42772 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42773 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42774 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42775 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42776 MVT MemVT = VT.getHalfNumVectorElementsVT();
42777 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42779 cast<LoadSDNode>(LHS), Ofs, DAG);
42780 }
42781 }
42782 }
42783
42784 return SDValue();
42785 }
42786 case X86ISD::PSHUFD:
42787 case X86ISD::PSHUFLW:
42788 case X86ISD::PSHUFHW: {
42789 SDValue N0 = N.getOperand(0);
42790 SDValue N1 = N.getOperand(1);
42791 if (N0->hasOneUse()) {
42793 switch (V.getOpcode()) {
42794 case X86ISD::VSHL:
42795 case X86ISD::VSRL:
42796 case X86ISD::VSRA:
42797 case X86ISD::VSHLI:
42798 case X86ISD::VSRLI:
42799 case X86ISD::VSRAI:
42800 case X86ISD::VROTLI:
42801 case X86ISD::VROTRI: {
42802 MVT InnerVT = V.getSimpleValueType();
42803 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42804 SDValue Res = DAG.getNode(Opcode, DL, VT,
42805 DAG.getBitcast(VT, V.getOperand(0)), N1);
42806 Res = DAG.getBitcast(InnerVT, Res);
42807 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42808 return DAG.getBitcast(VT, Res);
42809 }
42810 break;
42811 }
42812 }
42813 }
42814
42815 Mask = getPSHUFShuffleMask(N);
42816 assert(Mask.size() == 4);
42817 break;
42818 }
42819 case X86ISD::MOVSD:
42820 case X86ISD::MOVSH:
42821 case X86ISD::MOVSS: {
42822 SDValue N0 = N.getOperand(0);
42823 SDValue N1 = N.getOperand(1);
42824
42825 // Canonicalize scalar FPOps:
42826 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42827 // If commutable, allow OP(N1[0], N0[0]).
42828 unsigned Opcode1 = N1.getOpcode();
42829 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42830 Opcode1 == ISD::FDIV) {
42831 SDValue N10 = N1.getOperand(0);
42832 SDValue N11 = N1.getOperand(1);
42833 if (N10 == N0 ||
42834 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42835 if (N10 != N0)
42836 std::swap(N10, N11);
42837 MVT SVT = VT.getVectorElementType();
42838 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42839 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42840 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42841 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42842 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42843 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42844 }
42845 }
42846
42847 return SDValue();
42848 }
42849 case X86ISD::INSERTPS: {
42850 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42851 SDValue Op0 = N.getOperand(0);
42852 SDValue Op1 = N.getOperand(1);
42853 unsigned InsertPSMask = N.getConstantOperandVal(2);
42854 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42855 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42856 unsigned ZeroMask = InsertPSMask & 0xF;
42857
42858 // If we zero out all elements from Op0 then we don't need to reference it.
42859 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42860 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42861 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42862
42863 // If we zero out the element from Op1 then we don't need to reference it.
42864 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42865 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42866 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42867
42868 // Attempt to merge insertps Op1 with an inner target shuffle node.
42869 SmallVector<int, 8> TargetMask1;
42871 APInt KnownUndef1, KnownZero1;
42872 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42873 KnownZero1)) {
42874 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42875 // Zero/UNDEF insertion - zero out element and remove dependency.
42876 InsertPSMask |= (1u << DstIdx);
42877 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42878 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42879 }
42880 // Update insertps mask srcidx and reference the source input directly.
42881 int M = TargetMask1[SrcIdx];
42882 assert(0 <= M && M < 8 && "Shuffle index out of range");
42883 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42884 Op1 = Ops1[M < 4 ? 0 : 1];
42885 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42886 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42887 }
42888
42889 // Attempt to merge insertps Op0 with an inner target shuffle node.
42890 SmallVector<int, 8> TargetMask0;
42892 APInt KnownUndef0, KnownZero0;
42893 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42894 KnownZero0)) {
42895 bool Updated = false;
42896 bool UseInput00 = false;
42897 bool UseInput01 = false;
42898 for (int i = 0; i != 4; ++i) {
42899 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42900 // No change if element is already zero or the inserted element.
42901 continue;
42902 }
42903
42904 if (KnownUndef0[i] || KnownZero0[i]) {
42905 // If the target mask is undef/zero then we must zero the element.
42906 InsertPSMask |= (1u << i);
42907 Updated = true;
42908 continue;
42909 }
42910
42911 // The input vector element must be inline.
42912 int M = TargetMask0[i];
42913 if (M != i && M != (i + 4))
42914 return SDValue();
42915
42916 // Determine which inputs of the target shuffle we're using.
42917 UseInput00 |= (0 <= M && M < 4);
42918 UseInput01 |= (4 <= M);
42919 }
42920
42921 // If we're not using both inputs of the target shuffle then use the
42922 // referenced input directly.
42923 if (UseInput00 && !UseInput01) {
42924 Updated = true;
42925 Op0 = Ops0[0];
42926 } else if (!UseInput00 && UseInput01) {
42927 Updated = true;
42928 Op0 = Ops0[1];
42929 }
42930
42931 if (Updated)
42932 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42933 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42934 }
42935
42936 // If we're inserting an element from a vbroadcast load, fold the
42937 // load into the X86insertps instruction. We need to convert the scalar
42938 // load to a vector and clear the source lane of the INSERTPS control.
42939 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42940 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42941 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42942 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42943 MemIntr->getBasePtr(),
42944 MemIntr->getMemOperand());
42945 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42947 Load),
42948 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42949 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42950 return Insert;
42951 }
42952 }
42953
42954 return SDValue();
42955 }
42956 case X86ISD::VPERMV: {
42957 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
42959 SmallVector<SDValue, 2> SrcOps, SubOps;
42960 SDValue Src = peekThroughBitcasts(N.getOperand(1));
42961 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
42962 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
42963 collectConcatOps(Src.getNode(), SubOps, DAG)) {
42964 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42965 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
42966 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
42967 "Unexpected split ops");
42968 // Bail if we were permuting a widened vector.
42969 if (SubOps[1].isUndef() &&
42970 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
42971 return SDValue();
42972 // Bail if any subops would have folded into the concat.
42973 if (any_of(SubOps, isShuffleFoldableLoad))
42974 return SDValue();
42975 // Concat 4x128 back to 2x256.
42976 if (SubOps.size() == 4) {
42977 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
42978 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
42979 }
42980 // Convert mask to 2 operand shuffle.
42981 int HalfElts = NumElts / 2;
42982 for (int &M : Mask)
42983 M += M >= HalfElts ? HalfElts : 0;
42984 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
42985 VT.getSizeInBits());
42986 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
42987 VT.getSizeInBits());
42988 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
42989 DAG.getBitcast(VT, Hi), Subtarget, DAG);
42990 }
42991 return SDValue();
42992 }
42993 case X86ISD::VPERMV3: {
42994 MVT WideVT = VT.getDoubleNumVectorElementsVT();
42995 bool CanConcat = VT.is128BitVector() ||
42996 (VT.is256BitVector() && Subtarget.useAVX512Regs());
42999 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43000 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43001 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43002 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43003 // Canonicalize to VPERMV if both sources are the same.
43004 if (V1 == V2) {
43005 for (int &M : Mask)
43006 M = (M < 0 ? M : (M & (NumElts - 1)));
43007 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43008 DAG.getUNDEF(VT), Subtarget, DAG);
43009 }
43010 // If sources are half width, then concat and use VPERMV with adjusted
43011 // mask.
43012 SDValue Ops[2];
43013 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43014 if (sd_match(V1,
43015 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
43016 sd_match(V2,
43017 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
43018 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43019 if (SDValue ConcatSrc =
43020 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43021 for (int &M : Mask)
43022 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43023 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43024 DAG.getUNDEF(VT), Subtarget, DAG);
43025 }
43026 }
43027 // Commute foldable source to the RHS.
43028 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43029 !isShuffleFoldableLoad(N.getOperand(2))) {
43031 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43032 N.getOperand(0), Subtarget, DAG);
43033 }
43034 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43035 // freely concatenated, with a commuted shuffle mask.
43036 if (CanConcat) {
43037 if (SDValue ConcatSrc = combineConcatVectorOps(
43038 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43039 Subtarget)) {
43041 Mask.append(NumElts, SM_SentinelUndef);
43042 SDValue Perm =
43043 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43044 DAG.getUNDEF(WideVT), Subtarget, DAG);
43045 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43046 DAG.getVectorIdxConstant(0, DL));
43047 }
43048 }
43049 }
43050 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43051 // freely concatenated.
43052 if (CanConcat) {
43053 if (SDValue ConcatSrc = combineConcatVectorOps(
43054 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43055 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43056 DL, WideVT.getSizeInBits());
43057 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43058 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43059 DAG.getVectorIdxConstant(0, DL));
43060 }
43061 }
43062 return SDValue();
43063 }
43064 default:
43065 return SDValue();
43066 }
43067
43068 // Nuke no-op shuffles that show up after combining.
43069 if (isNoopShuffleMask(Mask))
43070 return N.getOperand(0);
43071
43072 // Look for simplifications involving one or two shuffle instructions.
43073 SDValue V = N.getOperand(0);
43074 switch (N.getOpcode()) {
43075 default:
43076 break;
43077 case X86ISD::PSHUFLW:
43078 case X86ISD::PSHUFHW:
43079 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43080
43081 // See if this reduces to a PSHUFD which is no more expensive and can
43082 // combine with more operations. Note that it has to at least flip the
43083 // dwords as otherwise it would have been removed as a no-op.
43084 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43085 int DMask[] = {0, 1, 2, 3};
43086 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43087 DMask[DOffset + 0] = DOffset + 1;
43088 DMask[DOffset + 1] = DOffset + 0;
43089 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43090 V = DAG.getBitcast(DVT, V);
43091 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43092 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43093 return DAG.getBitcast(VT, V);
43094 }
43095
43096 // Look for shuffle patterns which can be implemented as a single unpack.
43097 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43098 // only works when we have a PSHUFD followed by two half-shuffles.
43099 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43100 (V.getOpcode() == X86ISD::PSHUFLW ||
43101 V.getOpcode() == X86ISD::PSHUFHW) &&
43102 V.getOpcode() != N.getOpcode() &&
43103 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43104 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43105 if (D.getOpcode() == X86ISD::PSHUFD) {
43108 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43109 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43110 int WordMask[8];
43111 for (int i = 0; i < 4; ++i) {
43112 WordMask[i + NOffset] = Mask[i] + NOffset;
43113 WordMask[i + VOffset] = VMask[i] + VOffset;
43114 }
43115 // Map the word mask through the DWord mask.
43116 int MappedMask[8];
43117 for (int i = 0; i < 8; ++i)
43118 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43119 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43120 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43121 // We can replace all three shuffles with an unpack.
43122 V = DAG.getBitcast(VT, D.getOperand(0));
43123 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43125 DL, VT, V, V);
43126 }
43127 }
43128 }
43129
43130 break;
43131
43132 case X86ISD::PSHUFD:
43133 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43134 return NewN;
43135
43136 break;
43137 }
43138
43139 return SDValue();
43140}
43141
43142/// Checks if the shuffle mask takes subsequent elements
43143/// alternately from two vectors.
43144/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43145static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43146
43147 int ParitySrc[2] = {-1, -1};
43148 unsigned Size = Mask.size();
43149 for (unsigned i = 0; i != Size; ++i) {
43150 int M = Mask[i];
43151 if (M < 0)
43152 continue;
43153
43154 // Make sure we are using the matching element from the input.
43155 if ((M % Size) != i)
43156 return false;
43157
43158 // Make sure we use the same input for all elements of the same parity.
43159 int Src = M / Size;
43160 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43161 return false;
43162 ParitySrc[i % 2] = Src;
43163 }
43164
43165 // Make sure each input is used.
43166 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43167 return false;
43168
43169 Op0Even = ParitySrc[0] == 0;
43170 return true;
43171}
43172
43173 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
43174 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
43175 /// are written to the parameters \p Opnd0 and \p Opnd1.
43176 ///
43177 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
43178 /// so it is easier to generically match. We also insert dummy vector shuffle
43179 /// nodes for the operands which explicitly discard the lanes which are unused
43180 /// by this operation to try to flow through the rest of the combiner
43181 /// the fact that they're unused.
/// \p IsSubAdd is set when the input feeding the even-parity lanes is the
/// FADD node (i.e. the matched pattern is SUBADD rather than ADDSUB).
/// \p HasAllowContract — presumably set when the matched FADD/FSUB permit
/// contraction; NOTE(review): the right-hand side of its assignment (original
/// line 43236) is missing from this extraction, verify against upstream.
43182 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43183 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43184 bool &IsSubAdd, bool &HasAllowContract) {
43185
43186 EVT VT = N->getValueType(0);
43187 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Bail out when the subtarget lacks SSE3 or the type is not legal.
// NOTE(review): the trailing clause of this condition (original line 43189)
// is missing from this extraction — confirm the full predicate upstream.
43188 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43190 return false;
43191
43192 // We only handle target-independent shuffles.
43193 // FIXME: It would be easy and harmless to use the target shuffle mask
43194 // extraction tool to support more.
43195 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43196 return false;
43197
43198 SDValue V1 = N->getOperand(0);
43199 SDValue V2 = N->getOperand(1);
43200
43201 // Make sure we have an FADD and an FSUB, one on each side of the shuffle.
43202 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43203 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43204 V1.getOpcode() == V2.getOpcode())
43205 return false;
43206
43207 // If there are other uses of these operations we can't fold them.
43208 if (!V1->hasOneUse() || !V2->hasOneUse())
43209 return false;
43210
43211 // Ensure that both operations have the same operands. Note that we can
43212 // commute the FADD operands.
43213 SDValue LHS, RHS;
43214 if (V1.getOpcode() == ISD::FSUB) {
43215 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43216 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43217 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43218 return false;
43219 } else {
// V1 was the FADD, so V2 must be the FSUB (the opcodes differ by the check
// above and both are FADD/FSUB).
43220 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43221 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43222 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43223 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43224 return false;
43225 }
43226
// The shuffle must pick lanes alternately (even parity from one input, odd
// from the other) for the ADDSUB/SUBADD interpretation to hold.
43227 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43228 bool Op0Even;
43229 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43230 return false;
43231
43232 // It's a subadd if the vector in the even parity is an FADD.
43233 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43234 : V2->getOpcode() == ISD::FADD;
// NOTE(review): the initializer of HasAllowContract (original line 43236)
// is missing from this extraction.
43235 HasAllowContract =
43237
43238 Opnd0 = LHS;
43239 Opnd1 = RHS;
43240 return true;
43241 }
43242
43243 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
/// Returns the fused X86ISD::FMADDSUB / X86ISD::FMSUBADD node on success,
/// or SDValue() when the pattern does not match.
/// NOTE(review): the first line of this function's signature (original line
/// 43244 — the return type, name, and leading parameters) is missing from
/// this extraction; the body references N and DL, so it presumably reads
/// `static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,` —
/// confirm against upstream.
43245 const X86Subtarget &Subtarget,
43246 SelectionDAG &DAG) {
43247 // We only handle target-independent shuffles.
43248 // FIXME: It would be easy and harmless to use the target shuffle mask
43249 // extraction tool to support more.
43250 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43251 return SDValue();
43252
43253 MVT VT = N->getSimpleValueType(0);
43254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Need an FMA-capable subtarget and a legal type for the fused node.
43255 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43256 return SDValue();
43257
43258 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43259 SDValue Op0 = N->getOperand(0);
43260 SDValue Op1 = N->getOperand(1);
// Canonicalize so FMSub names the X86ISD::FMSUB side regardless of which
// shuffle operand it arrived on.
43261 SDValue FMAdd = Op0, FMSub = Op1;
43262 if (FMSub.getOpcode() != X86ISD::FMSUB)
43263 std::swap(FMAdd, FMSub);
43264
// Both nodes must compute over the identical (a, b, c) operands and have no
// other uses, otherwise the fold would duplicate or lose work.
43265 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43266 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43267 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43268 FMAdd.getOperand(2) != FMSub.getOperand(2))
43269 return SDValue();
43270
43271 // Check for correct shuffle mask.
43272 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43273 bool Op0Even;
43274 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43275 return SDValue();
43276
43277 // FMAddSub takes zeroth operand from FMSub node.
// If the even-parity lanes come from the FMA (add) node, the result is the
// sub-at-even/add-at-odd form, i.e. FMSUBADD; otherwise FMADDSUB.
43278 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43279 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43280 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43281 FMAdd.getOperand(2));
43282 }
43283
43284/// Try to combine a shuffle into a target-specific add-sub or
43285/// mul-add-sub node.
// NOTE(review): the extraction dropped the function's signature line (doxygen
// line 43286); presumably
// `static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,`
// — confirm against the upstream file.
43287 const X86Subtarget &Subtarget,
43288 SelectionDAG &DAG) {
// First try the fma(a,b,c)/fmsub(a,b,c) shuffle pattern.
43289 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43290 return V;
43291
// Otherwise look for a blend of matching FADD/FSUB nodes.
43292 SDValue Opnd0, Opnd1;
43293 bool IsSubAdd;
43294 bool HasAllowContract;
43295 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43296 HasAllowContract))
43297 return SDValue();
43298
43299 MVT VT = N->getSimpleValueType(0);
43300
43301 // Try to generate X86ISD::FMADDSUB node here.
43302 SDValue Opnd2;
43303 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43304 HasAllowContract)) {
43305 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43306 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43307 }
43308
// Only the FMA form of subadd exists; there is no plain SUBADD node to
// emit, so give up on the subadd pattern here.
43309 if (IsSubAdd)
43310 return SDValue();
43311
43312 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43313 // the ADDSUB idiom has been successfully recognized. There are no known
43314 // X86 targets with 512-bit ADDSUB instructions!
43315 if (VT.is512BitVector())
43316 return SDValue();
43317
43318 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43319 // the ADDSUB idiom has been successfully recognized. There are no known
43320 // X86 targets with FP16 ADDSUB instructions!
43321 if (VT.getVectorElementType() == MVT::f16)
43322 return SDValue();
43323
43324 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43325}
43326
43327/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43328/// low half of each source vector and does not set any high half elements in
43329/// the destination vector, narrow the shuffle to half its original size.
// NOTE(review): the extraction dropped the function's signature line (doxygen
// line 43330); from the call site below it is presumably
// `static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {`
// — confirm against the upstream file.
43331 EVT VT = Shuf->getValueType(0);
43332 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43333 return SDValue();
// Only worthwhile for wide (256/512-bit) vectors that have a cheaper
// half-width form.
43334 if (!VT.is256BitVector() && !VT.is512BitVector())
43335 return SDValue();
43336
43337 // See if we can ignore all of the high elements of the shuffle.
43338 ArrayRef<int> Mask = Shuf->getMask();
43339 if (!isUndefUpperHalf(Mask))
43340 return SDValue();
43341
43342 // Check if the shuffle mask accesses only the low half of each input vector
43343 // (half-index output is 0 or 2).
// getHalfShuffleMask returns per-input half indices; odd indices would mean
// an upper half of a source is referenced, which this narrowing can't handle.
43344 int HalfIdx1, HalfIdx2;
43345 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43346 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43347 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43348 return SDValue();
43349
43350 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43351 // The trick is knowing that all of the insert/extract are actually free
43352 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43353 // of narrow inputs into a narrow output, and that is always cheaper than
43354 // the wide shuffle that we started with.
43355 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43356 Shuf->getOperand(1), HalfMask, HalfIdx1,
43357 HalfIdx2, false, DAG, /*UseConcat*/ true);
43358}
43359
// Top-level DAG combine for shuffle nodes: tries narrowing, add-sub fusion,
// load combining, then target-shuffle-specific simplifications.
// NOTE(review): the extraction dropped the opening signature lines (doxygen
// lines 43360-43361); from the parameters used below they presumably read
// `static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,`
// `                              TargetLowering::DAGCombinerInfo &DCI,`
// — confirm against the upstream file.
43362 const X86Subtarget &Subtarget) {
43363 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43364 if (SDValue V = narrowShuffle(Shuf, DAG))
43365 return V;
43366
43367 // If we have legalized the vector types, look for blends of FADD and FSUB
43368 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43369 SDLoc dl(N);
43370 EVT VT = N->getValueType(0);
43371 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43372 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43373 if (SDValue AddSub =
43374 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43375 return AddSub;
43376
43377 // Attempt to combine into a vector load/broadcast.
// NOTE(review): the extraction dropped the head of this call (doxygen line
// 43378), presumably `if (SDValue LD = combineToConsecutiveLoads(` — confirm
// against the upstream file.
43379 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43380 return LD;
43381
// Target-specific shuffle opcodes get an extra round of combines.
43382 if (isTargetShuffle(N->getOpcode())) {
43383 SDValue Op(N, 0);
43384 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43385 return Shuffle;
43386
43387 // Try recursively combining arbitrary sequences of x86 shuffle
43388 // instructions into higher-order shuffles. We do this after combining
43389 // specific PSHUF instruction sequences into their minimal form so that we
43390 // can evaluate how many specialized shuffle instructions are involved in
43391 // a particular chain.
43392 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43393 return Res;
43394
43395 // Simplify source operands based on shuffle mask.
43396 // TODO - merge this into combineX86ShufflesRecursively.
// Returning the node itself signals to the combiner that Op was updated
// in place by SimplifyDemandedVectorElts.
43397 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43398 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43399 return SDValue(N, 0);
43400
43401 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43402 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43403 // Perform this after other shuffle combines to allow inner shuffles to be
43404 // combined away first.
43405 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43406 return BinOp;
43407 }
43408
43409 return SDValue();
43410}
43411
43412// Simplify variable target shuffle masks based on the demanded elements.
43413// TODO: Handle DemandedBits in mask indices as well?
// Returns true if the mask operand at MaskIndex was simplified (either
// generically or by rewriting its constant-pool entry with undefs in the
// undemanded lanes), false if nothing changed.
// NOTE(review): the extraction dropped the signature line (doxygen line
// 43414); presumably
// `bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(`
// — confirm against the upstream file.
43415 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43416 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43417 // If we're demanding all elements don't bother trying to simplify the mask.
43418 unsigned NumElts = DemandedElts.getBitWidth();
43419 if (DemandedElts.isAllOnes())
43420 return false;
43421
// Only simplify single-use masks: rewriting a shared mask could pessimize
// the other users.
43422 SDValue Mask = Op.getOperand(MaskIndex);
43423 if (!Mask.hasOneUse())
43424 return false;
43425
43426 // Attempt to generically simplify the variable shuffle mask.
43427 APInt MaskUndef, MaskZero;
43428 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43429 Depth + 1))
43430 return true;
43431
43432 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43433 // TODO: Support other types from getTargetShuffleMaskIndices?
// NOTE(review): the extraction dropped the line defining BC (doxygen line
// 43434), presumably `SDValue BC = peekThroughOneUseBitcasts(Mask);` —
// confirm against the upstream file.
43435 EVT BCVT = BC.getValueType();
43436 auto *Load = dyn_cast<LoadSDNode>(BC);
43437 if (!Load || !Load->getBasePtr().hasOneUse())
43438 return false;
43439
43440 const Constant *C = getTargetConstantFromNode(Load);
43441 if (!C)
43442 return false;
43443
// The constant must be a vector of exactly the mask's bit width.
43444 Type *CTy = C->getType();
43445 if (!CTy->isVectorTy() ||
43446 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43447 return false;
43448
43449 // Handle scaling for i64 elements on 32-bit targets.
// Scale is 1 (same element count) or 2 (two constant elements per
// demanded element).
43450 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43451 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43452 return false;
43453 unsigned Scale = NumCstElts / NumElts;
43454
43455 // Simplify mask if we have an undemanded element that is not undef.
43456 bool Simplified = false;
43457 SmallVector<Constant *, 32> ConstVecOps;
43458 for (unsigned i = 0; i != NumCstElts; ++i) {
43459 Constant *Elt = C->getAggregateElement(i);
43460 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43461 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43462 Simplified = true;
43463 continue;
43464 }
43465 ConstVecOps.push_back(Elt);
43466 }
43467 if (!Simplified)
43468 return false;
43469
43470 // Generate new constant pool entry + legalize immediately for the load.
43471 SDLoc DL(Op);
43472 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43473 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43474 SDValue NewMask = TLO.DAG.getLoad(
43475 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
// NOTE(review): the extraction dropped an argument line here (doxygen line
// 43476), presumably the MachinePointerInfo for the constant pool — confirm
// against the upstream file.
43477 Load->getAlign());
// Replace the old mask (in its original type) with the simplified load.
43478 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43479}
43480
43482 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43483 TargetLoweringOpt &TLO, unsigned Depth) const {
43484 int NumElts = DemandedElts.getBitWidth();
43485 unsigned Opc = Op.getOpcode();
43486 EVT VT = Op.getValueType();
43487
43488 // Handle special case opcodes.
43489 switch (Opc) {
43490 case X86ISD::PMULDQ:
43491 case X86ISD::PMULUDQ: {
43492 APInt LHSUndef, LHSZero;
43493 APInt RHSUndef, RHSZero;
43494 SDValue LHS = Op.getOperand(0);
43495 SDValue RHS = Op.getOperand(1);
43496 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43497 Depth + 1))
43498 return true;
43499 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43500 Depth + 1))
43501 return true;
43502 // Multiply by zero.
43503 KnownZero = LHSZero | RHSZero;
43504 break;
43505 }
43506 case X86ISD::VPMADDUBSW:
43507 case X86ISD::VPMADDWD: {
43508 APInt LHSUndef, LHSZero;
43509 APInt RHSUndef, RHSZero;
43510 SDValue LHS = Op.getOperand(0);
43511 SDValue RHS = Op.getOperand(1);
43512 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43513
43514 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43515 Depth + 1))
43516 return true;
43517 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43518 Depth + 1))
43519 return true;
43520
43521 // TODO: Multiply by zero.
43522
43523 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43524 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43525 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43526 Depth + 1))
43527 return true;
43528 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43529 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43530 Depth + 1))
43531 return true;
43532 break;
43533 }
43534 case X86ISD::PSADBW: {
43535 SDValue LHS = Op.getOperand(0);
43536 SDValue RHS = Op.getOperand(1);
43537 assert(VT.getScalarType() == MVT::i64 &&
43538 LHS.getValueType() == RHS.getValueType() &&
43539 LHS.getValueType().getScalarType() == MVT::i8 &&
43540 "Unexpected PSADBW types");
43541
43542 // Aggressively peek through ops to get at the demanded elts.
43543 if (!DemandedElts.isAllOnes()) {
43544 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43545 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43547 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43549 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43550 if (NewLHS || NewRHS) {
43551 NewLHS = NewLHS ? NewLHS : LHS;
43552 NewRHS = NewRHS ? NewRHS : RHS;
43553 return TLO.CombineTo(
43554 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43555 }
43556 }
43557 break;
43558 }
43559 case X86ISD::VSHL:
43560 case X86ISD::VSRL:
43561 case X86ISD::VSRA: {
43562 // We only need the bottom 64-bits of the (128-bit) shift amount.
43563 SDValue Amt = Op.getOperand(1);
43564 MVT AmtVT = Amt.getSimpleValueType();
43565 assert(AmtVT.is128BitVector() && "Unexpected value type");
43566
43567 // If we reuse the shift amount just for sse shift amounts then we know that
43568 // only the bottom 64-bits are only ever used.
43569 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43570 unsigned UseOpc = Use->getOpcode();
43571 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43572 UseOpc == X86ISD::VSRA) &&
43573 Use->getOperand(0) != Amt;
43574 });
43575
43576 APInt AmtUndef, AmtZero;
43577 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43578 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43579 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43580 Depth + 1, AssumeSingleUse))
43581 return true;
43582 [[fallthrough]];
43583 }
43584 case X86ISD::VSHLI:
43585 case X86ISD::VSRLI:
43586 case X86ISD::VSRAI: {
43587 SDValue Src = Op.getOperand(0);
43588 APInt SrcUndef;
43589 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43590 Depth + 1))
43591 return true;
43592
43593 // Fold shift(0,x) -> 0
43594 if (DemandedElts.isSubsetOf(KnownZero))
43595 return TLO.CombineTo(
43596 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43597
43598 // Aggressively peek through ops to get at the demanded elts.
43599 if (!DemandedElts.isAllOnes())
43601 Src, DemandedElts, TLO.DAG, Depth + 1))
43602 return TLO.CombineTo(
43603 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43604 break;
43605 }
43606 case X86ISD::VPSHA:
43607 case X86ISD::VPSHL:
43608 case X86ISD::VSHLV:
43609 case X86ISD::VSRLV:
43610 case X86ISD::VSRAV: {
43611 APInt LHSUndef, LHSZero;
43612 APInt RHSUndef, RHSZero;
43613 SDValue LHS = Op.getOperand(0);
43614 SDValue RHS = Op.getOperand(1);
43615 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43616 Depth + 1))
43617 return true;
43618
43619 // Fold shift(0,x) -> 0
43620 if (DemandedElts.isSubsetOf(LHSZero))
43621 return TLO.CombineTo(
43622 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43623
43624 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43625 Depth + 1))
43626 return true;
43627
43628 KnownZero = LHSZero;
43629 break;
43630 }
43631 case X86ISD::CMPM:
43632 case X86ISD::CMPP: {
43633 // Scalarize packed fp comparison if we only require element 0.
43634 if (DemandedElts == 1) {
43635 SDLoc dl(Op);
43636 MVT VT = Op.getSimpleValueType();
43637 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43638 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43639 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43640 SDValue CC = Op.getOperand(2);
43641 if (Opc == X86ISD::CMPM) {
43642 SDValue Cmp =
43643 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43644 return TLO.CombineTo(
43645 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43646 }
43647 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43648 return TLO.CombineTo(Op,
43649 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43650 }
43651 break;
43652 }
43653 case X86ISD::PCMPEQ:
43654 case X86ISD::PCMPGT: {
43655 APInt LHSUndef, LHSZero;
43656 APInt RHSUndef, RHSZero;
43657 SDValue LHS = Op.getOperand(0);
43658 SDValue RHS = Op.getOperand(1);
43659 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43660 Depth + 1))
43661 return true;
43662 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43663 Depth + 1))
43664 return true;
43665 break;
43666 }
43667 case X86ISD::KSHIFTL: {
43668 SDValue Src = Op.getOperand(0);
43669 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43670 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43671 unsigned ShiftAmt = Amt->getZExtValue();
43672
43673 if (ShiftAmt == 0)
43674 return TLO.CombineTo(Op, Src);
43675
43676 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43677 // single shift. We can do this if the bottom bits (which are shifted
43678 // out) are never demanded.
43679 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43680 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43681 unsigned C1 = Src.getConstantOperandVal(1);
43682 unsigned NewOpc = X86ISD::KSHIFTL;
43683 int Diff = ShiftAmt - C1;
43684 if (Diff < 0) {
43685 Diff = -Diff;
43686 NewOpc = X86ISD::KSHIFTR;
43687 }
43688
43689 SDLoc dl(Op);
43690 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43691 return TLO.CombineTo(
43692 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43693 }
43694 }
43695
43696 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43697 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43698 Depth + 1))
43699 return true;
43700
43701 KnownUndef <<= ShiftAmt;
43702 KnownZero <<= ShiftAmt;
43703 KnownZero.setLowBits(ShiftAmt);
43704 break;
43705 }
43706 case X86ISD::KSHIFTR: {
43707 SDValue Src = Op.getOperand(0);
43708 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43709 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43710 unsigned ShiftAmt = Amt->getZExtValue();
43711
43712 if (ShiftAmt == 0)
43713 return TLO.CombineTo(Op, Src);
43714
43715 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43716 // single shift. We can do this if the top bits (which are shifted
43717 // out) are never demanded.
43718 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43719 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43720 unsigned C1 = Src.getConstantOperandVal(1);
43721 unsigned NewOpc = X86ISD::KSHIFTR;
43722 int Diff = ShiftAmt - C1;
43723 if (Diff < 0) {
43724 Diff = -Diff;
43725 NewOpc = X86ISD::KSHIFTL;
43726 }
43727
43728 SDLoc dl(Op);
43729 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43730 return TLO.CombineTo(
43731 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43732 }
43733 }
43734
43735 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43736 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43737 Depth + 1))
43738 return true;
43739
43740 KnownUndef.lshrInPlace(ShiftAmt);
43741 KnownZero.lshrInPlace(ShiftAmt);
43742 KnownZero.setHighBits(ShiftAmt);
43743 break;
43744 }
43745 case X86ISD::ANDNP: {
43746 // ANDNP = (~LHS & RHS);
43747 SDValue LHS = Op.getOperand(0);
43748 SDValue RHS = Op.getOperand(1);
43749
43750 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43751 APInt UndefElts;
43752 SmallVector<APInt> EltBits;
43753 int NumElts = VT.getVectorNumElements();
43754 int EltSizeInBits = VT.getScalarSizeInBits();
43755 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43756 APInt OpElts = DemandedElts;
43757 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43758 EltBits)) {
43759 OpBits.clearAllBits();
43760 OpElts.clearAllBits();
43761 for (int I = 0; I != NumElts; ++I) {
43762 if (!DemandedElts[I])
43763 continue;
43764 if (UndefElts[I]) {
43765 // We can't assume an undef src element gives an undef dst - the
43766 // other src might be zero.
43767 OpBits.setAllBits();
43768 OpElts.setBit(I);
43769 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43770 (!Invert && !EltBits[I].isZero())) {
43771 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43772 OpElts.setBit(I);
43773 }
43774 }
43775 }
43776 return std::make_pair(OpBits, OpElts);
43777 };
43778 APInt BitsLHS, EltsLHS;
43779 APInt BitsRHS, EltsRHS;
43780 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43781 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43782
43783 APInt LHSUndef, LHSZero;
43784 APInt RHSUndef, RHSZero;
43785 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43786 Depth + 1))
43787 return true;
43788 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43789 Depth + 1))
43790 return true;
43791
43792 if (!DemandedElts.isAllOnes()) {
43793 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43794 TLO.DAG, Depth + 1);
43795 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43796 TLO.DAG, Depth + 1);
43797 if (NewLHS || NewRHS) {
43798 NewLHS = NewLHS ? NewLHS : LHS;
43799 NewRHS = NewRHS ? NewRHS : RHS;
43800 return TLO.CombineTo(
43801 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43802 }
43803 }
43804 break;
43805 }
43806 case X86ISD::CVTSI2P:
43807 case X86ISD::CVTUI2P:
43808 case X86ISD::CVTPH2PS:
43809 case X86ISD::CVTPS2PH: {
43810 SDValue Src = Op.getOperand(0);
43811 EVT SrcVT = Src.getValueType();
43812 APInt SrcUndef, SrcZero;
43813 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43814 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43815 Depth + 1))
43816 return true;
43817 break;
43818 }
43819 case X86ISD::PACKSS:
43820 case X86ISD::PACKUS: {
43821 SDValue N0 = Op.getOperand(0);
43822 SDValue N1 = Op.getOperand(1);
43823
43824 APInt DemandedLHS, DemandedRHS;
43825 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43826
43827 APInt LHSUndef, LHSZero;
43828 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43829 Depth + 1))
43830 return true;
43831 APInt RHSUndef, RHSZero;
43832 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43833 Depth + 1))
43834 return true;
43835
43836 // TODO - pass on known zero/undef.
43837
43838 // Aggressively peek through ops to get at the demanded elts.
43839 // TODO - we should do this for all target/faux shuffles ops.
43840 if (!DemandedElts.isAllOnes()) {
43841 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43842 TLO.DAG, Depth + 1);
43843 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43844 TLO.DAG, Depth + 1);
43845 if (NewN0 || NewN1) {
43846 NewN0 = NewN0 ? NewN0 : N0;
43847 NewN1 = NewN1 ? NewN1 : N1;
43848 return TLO.CombineTo(Op,
43849 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43850 }
43851 }
43852 break;
43853 }
43854 case X86ISD::HADD:
43855 case X86ISD::HSUB:
43856 case X86ISD::FHADD:
43857 case X86ISD::FHSUB: {
43858 SDValue N0 = Op.getOperand(0);
43859 SDValue N1 = Op.getOperand(1);
43860
43861 APInt DemandedLHS, DemandedRHS;
43862 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43863
43864 APInt LHSUndef, LHSZero;
43865 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43866 Depth + 1))
43867 return true;
43868 APInt RHSUndef, RHSZero;
43869 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43870 Depth + 1))
43871 return true;
43872
43873 // TODO - pass on known zero/undef.
43874
43875 // Aggressively peek through ops to get at the demanded elts.
43876 // TODO: Handle repeated operands.
43877 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43878 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43879 TLO.DAG, Depth + 1);
43880 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43881 TLO.DAG, Depth + 1);
43882 if (NewN0 || NewN1) {
43883 NewN0 = NewN0 ? NewN0 : N0;
43884 NewN1 = NewN1 ? NewN1 : N1;
43885 return TLO.CombineTo(Op,
43886 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43887 }
43888 }
43889 break;
43890 }
43891 case X86ISD::VTRUNC:
43892 case X86ISD::VTRUNCS:
43893 case X86ISD::VTRUNCUS: {
43894 SDValue Src = Op.getOperand(0);
43895 MVT SrcVT = Src.getSimpleValueType();
43896 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43897 APInt SrcUndef, SrcZero;
43898 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43899 Depth + 1))
43900 return true;
43901 KnownZero = SrcZero.zextOrTrunc(NumElts);
43902 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43903 break;
43904 }
43905 case X86ISD::BLENDI: {
43906 SmallVector<int, 16> BlendMask;
43907 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43909 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43910 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43911 return TLO.CombineTo(Op, R);
43912 break;
43913 }
43914 case X86ISD::BLENDV: {
43915 APInt SelUndef, SelZero;
43916 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43917 SelZero, TLO, Depth + 1))
43918 return true;
43919
43920 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43921 APInt LHSUndef, LHSZero;
43922 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43923 LHSZero, TLO, Depth + 1))
43924 return true;
43925
43926 APInt RHSUndef, RHSZero;
43927 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43928 RHSZero, TLO, Depth + 1))
43929 return true;
43930
43931 KnownZero = LHSZero & RHSZero;
43932 KnownUndef = LHSUndef & RHSUndef;
43933 break;
43934 }
43935 case X86ISD::VZEXT_MOVL: {
43936 // If upper demanded elements are already zero then we have nothing to do.
43937 SDValue Src = Op.getOperand(0);
43938 APInt DemandedUpperElts = DemandedElts;
43939 DemandedUpperElts.clearLowBits(1);
43940 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43941 return TLO.CombineTo(Op, Src);
43942 break;
43943 }
43944 case X86ISD::VZEXT_LOAD: {
43945 // If upper demanded elements are not demanded then simplify to a
43946 // scalar_to_vector(load()).
43948 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43949 SDLoc DL(Op);
43950 auto *Mem = cast<MemSDNode>(Op);
43951 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43952 Mem->getMemOperand());
43953 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43954 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43955 }
43956 break;
43957 }
43958 case X86ISD::VBROADCAST: {
43959 SDValue Src = Op.getOperand(0);
43960 MVT SrcVT = Src.getSimpleValueType();
43961 // Don't bother broadcasting if we just need the 0'th element.
43962 if (DemandedElts == 1) {
43963 if (!SrcVT.isVector())
43964 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43965 else if (Src.getValueType() != VT)
43966 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43967 SDLoc(Op));
43968 return TLO.CombineTo(Op, Src);
43969 }
43970 if (!SrcVT.isVector())
43971 break;
43972 APInt SrcUndef, SrcZero;
43973 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43974 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43975 Depth + 1))
43976 return true;
43977 // Aggressively peek through src to get at the demanded elt.
43978 // TODO - we should do this for all target/faux shuffles ops.
43980 Src, SrcElts, TLO.DAG, Depth + 1))
43981 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43982 break;
43983 }
43984 case X86ISD::VPERMV:
43985 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43986 Depth))
43987 return true;
43988 break;
43989 case X86ISD::PSHUFB:
43990 case X86ISD::VPERMV3:
43991 case X86ISD::VPERMILPV:
43992 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43993 Depth))
43994 return true;
43995 break;
43996 case X86ISD::VPPERM:
43997 case X86ISD::VPERMIL2:
43998 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43999 Depth))
44000 return true;
44001 break;
44002 }
44003
44004 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44005 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44006 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44007 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44008 DemandedElts.lshr(NumElts / 2) == 0) {
44009 unsigned SizeInBits = VT.getSizeInBits();
44010 unsigned ExtSizeInBits = SizeInBits / 2;
44011
44012 // See if 512-bit ops only use the bottom 128-bits.
44013 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44014 ExtSizeInBits = SizeInBits / 4;
44015
44016 switch (Opc) {
44017 // Scalar broadcast.
44018 case X86ISD::VBROADCAST: {
44019 SDLoc DL(Op);
44020 SDValue Src = Op.getOperand(0);
44021 if (Src.getValueSizeInBits() > ExtSizeInBits)
44022 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44023 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44024 ExtSizeInBits / VT.getScalarSizeInBits());
44025 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44026 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44027 TLO.DAG, DL, ExtSizeInBits));
44028 }
44030 SDLoc DL(Op);
44031 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44032 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44033 ExtSizeInBits / VT.getScalarSizeInBits());
44034 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44035 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44036 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44037 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44038 MemIntr->getMemOperand());
44040 Bcst.getValue(1));
44041 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44042 TLO.DAG, DL, ExtSizeInBits));
44043 }
44044 // Subvector broadcast.
44046 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44047 EVT MemVT = MemIntr->getMemoryVT();
44048 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44049 SDLoc DL(Op);
44050 SDValue Ld =
44051 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44052 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44054 Ld.getValue(1));
44055 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44056 TLO.DAG, DL, ExtSizeInBits));
44057 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44058 SDLoc DL(Op);
44059 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44060 ExtSizeInBits / VT.getScalarSizeInBits());
44061 if (SDValue BcstLd =
44062 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44063 return TLO.CombineTo(Op,
44064 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44065 TLO.DAG, DL, ExtSizeInBits));
44066 }
44067 break;
44068 }
44069 // Byte shifts by immediate.
44070 case X86ISD::VSHLDQ:
44071 case X86ISD::VSRLDQ:
44072 // Shift by uniform.
44073 case X86ISD::VSHL:
44074 case X86ISD::VSRL:
44075 case X86ISD::VSRA:
44076 // Shift by immediate.
44077 case X86ISD::VSHLI:
44078 case X86ISD::VSRLI:
44079 case X86ISD::VSRAI: {
44080 SDLoc DL(Op);
44081 SDValue Ext0 =
44082 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44083 SDValue ExtOp =
44084 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44085 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44086 SDValue Insert =
44087 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44088 return TLO.CombineTo(Op, Insert);
44089 }
44090 case X86ISD::VPERMI: {
44091 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44092 // TODO: This should be done in shuffle combining.
44093 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44095 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44096 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44097 SDLoc DL(Op);
44098 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44099 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44100 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44101 return TLO.CombineTo(Op, Insert);
44102 }
44103 }
44104 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44105 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44106 SDLoc DL(Op);
44107 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44108 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44109 Op.getOperand(1));
44110 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44111 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44112 return TLO.CombineTo(Op, Insert);
44113 }
44114 break;
44115 }
44116 case X86ISD::VPERMV: {
44119 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44120 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44121 VT == MVT::v16f32) &&
44122 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44123 // For lane-crossing shuffles, only split in half in case we're still
44124 // referencing higher elements.
44125 unsigned HalfElts = NumElts / 2;
44126 unsigned HalfSize = SizeInBits / 2;
44127 Mask.resize(HalfElts);
44128 if (all_of(Mask,
44129 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44131 SDLoc DL(Op);
44132 SDValue Ext;
44133 SDValue M =
44134 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44135 SDValue V =
44136 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44137 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44138 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44139 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44140 else {
44142 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44143 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44144 TLO.DAG.getBitcast(ShufVT, V), M);
44145 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44146 }
44147 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44148 Subtarget, TLO.DAG, DL, SizeInBits);
44149 return TLO.CombineTo(Op, Insert);
44150 }
44151 }
44152 break;
44153 }
44154 case X86ISD::VPERMV3: {
44157 if (Subtarget.hasVLX() &&
44158 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44159 // For lane-crossing shuffles, only split in half in case we're still
44160 // referencing higher elements.
44161 unsigned HalfElts = NumElts / 2;
44162 unsigned HalfSize = SizeInBits / 2;
44163 Mask.resize(HalfElts);
44164 if (all_of(Mask, [&](int M) {
44165 return isUndefOrInRange(M, 0, HalfElts) ||
44166 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44167 })) {
44168 // Adjust mask elements for 2nd operand to point to half width.
44169 for (int &M : Mask)
44170 M = (M < NumElts) ? M : (M - HalfElts);
44172 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44173 SDLoc DL(Op);
44174 SDValue Ext = TLO.DAG.getNode(
44175 Opc, DL, HalfVT,
44176 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44177 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44178 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44179 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44180 Subtarget, TLO.DAG, DL, SizeInBits);
44181 return TLO.CombineTo(Op, Insert);
44182 }
44183 }
44184 break;
44185 }
44186 case X86ISD::VPERM2X128: {
44187 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
44188 SDLoc DL(Op);
44189 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44190 if (LoMask & 0x8)
44191 return TLO.CombineTo(
44192 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44193 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44194 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44195 SDValue ExtOp =
44196 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44197 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44198 SDValue Insert =
44199 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44200 return TLO.CombineTo(Op, Insert);
44201 }
44202 // Conversions.
44203 // TODO: Add more CVT opcodes when we have test coverage.
44204 case X86ISD::CVTTP2UI: {
44205 if (!Subtarget.hasVLX())
44206 break;
44207 [[fallthrough]];
44208 }
44209 case X86ISD::CVTTP2SI: {
44210 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44211 !Subtarget.hasVLX())
44212 break;
44213 [[fallthrough]];
44214 }
44215 case X86ISD::CVTPH2PS: {
44216 SDLoc DL(Op);
44217 unsigned Scale = SizeInBits / ExtSizeInBits;
44218 SDValue SrcOp = Op.getOperand(0);
44219 MVT SrcVT = SrcOp.getSimpleValueType();
44220 unsigned SrcExtSize =
44221 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44223 ExtSizeInBits / VT.getScalarSizeInBits());
44224 SDValue ExtOp = TLO.DAG.getNode(
44225 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44226 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44227 SDValue Insert =
44228 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44229 return TLO.CombineTo(Op, Insert);
44230 }
44231 // Zero upper elements.
44232 case X86ISD::VZEXT_MOVL:
44233 // Variable blend.
44234 case X86ISD::BLENDV:
44235 // Target unary shuffles:
44236 case X86ISD::MOVDDUP:
44237 // Target unary shuffles by immediate:
44238 case X86ISD::PSHUFD:
44239 case X86ISD::PSHUFLW:
44240 case X86ISD::PSHUFHW:
44241 case X86ISD::VPERMILPI:
44242 // (Non-Lane Crossing) Target Shuffles.
44243 case X86ISD::VPERMILPV:
44244 case X86ISD::VPERMIL2:
44245 case X86ISD::PSHUFB:
44246 case X86ISD::UNPCKL:
44247 case X86ISD::UNPCKH:
44248 case X86ISD::BLENDI:
44249 // Integer ops.
44250 case X86ISD::PACKSS:
44251 case X86ISD::PACKUS:
44252 case X86ISD::PCMPEQ:
44253 case X86ISD::PCMPGT:
44254 case X86ISD::PMULUDQ:
44255 case X86ISD::PMULDQ:
44256 case X86ISD::VSHLV:
44257 case X86ISD::VSRLV:
44258 case X86ISD::VSRAV:
44259 // Float ops.
44260 case X86ISD::FMAX:
44261 case X86ISD::FMIN:
44262 case X86ISD::FMAXC:
44263 case X86ISD::FMINC:
44264 case X86ISD::FRSQRT:
44265 case X86ISD::FRCP:
44266 // Horizontal Ops.
44267 case X86ISD::HADD:
44268 case X86ISD::HSUB:
44269 case X86ISD::FHADD:
44270 case X86ISD::FHSUB: {
44271 SDLoc DL(Op);
44273 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44274 SDValue SrcOp = Op.getOperand(i);
44275 EVT SrcVT = SrcOp.getValueType();
44276 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44277 "Unsupported vector size");
44278 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44279 ExtSizeInBits)
44280 : SrcOp);
44281 }
44282 MVT ExtVT = VT.getSimpleVT();
44283 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44284 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44285 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44286 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44287 SDValue Insert =
44288 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44289 return TLO.CombineTo(Op, Insert);
44290 }
44291 }
44292 }
44293
44294 // For splats, unless we *only* demand the 0'th element,
44295 // stop attempts at simplification here, we aren't going to improve things,
44296 // this is better than any potential shuffle.
44297 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44298 return false;
44299
44300 // Get target/faux shuffle mask.
44301 APInt OpUndef, OpZero;
44302 SmallVector<int, 64> OpMask;
44303 SmallVector<SDValue, 2> OpInputs;
44304 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44305 OpZero, TLO.DAG, Depth, false))
44306 return false;
44307
44308 // Shuffle inputs must be the same size as the result.
44309 if (OpMask.size() != (unsigned)NumElts ||
44310 llvm::any_of(OpInputs, [VT](SDValue V) {
44311 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44312 !V.getValueType().isVector();
44313 }))
44314 return false;
44315
44316 KnownZero = OpZero;
44317 KnownUndef = OpUndef;
44318
44319 // Check if shuffle mask can be simplified to undef/zero/identity.
44320 int NumSrcs = OpInputs.size();
44321 for (int i = 0; i != NumElts; ++i)
44322 if (!DemandedElts[i])
44323 OpMask[i] = SM_SentinelUndef;
44324
44325 if (isUndefInRange(OpMask, 0, NumElts)) {
44326 KnownUndef.setAllBits();
44327 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44328 }
44329 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44330 KnownZero.setAllBits();
44331 return TLO.CombineTo(
44332 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44333 }
44334 for (int Src = 0; Src != NumSrcs; ++Src)
44335 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44336 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44337
44338 // Attempt to simplify inputs.
44339 for (int Src = 0; Src != NumSrcs; ++Src) {
44340 // TODO: Support inputs of different types.
44341 if (OpInputs[Src].getValueType() != VT)
44342 continue;
44343
44344 int Lo = Src * NumElts;
44345 APInt SrcElts = APInt::getZero(NumElts);
44346 for (int i = 0; i != NumElts; ++i)
44347 if (DemandedElts[i]) {
44348 int M = OpMask[i] - Lo;
44349 if (0 <= M && M < NumElts)
44350 SrcElts.setBit(M);
44351 }
44352
44353 // TODO - Propagate input undef/zero elts.
44354 APInt SrcUndef, SrcZero;
44355 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44356 TLO, Depth + 1))
44357 return true;
44358 }
44359
44360 // If we don't demand all elements, then attempt to combine to a simpler
44361 // shuffle.
44362 // We need to convert the depth to something combineX86ShufflesRecursively
44363 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44364 // to match. This prevents combineX86ShuffleChain from returning a
44365 // combined shuffle that's the same as the original root, causing an
44366 // infinite loop.
44367 if (!DemandedElts.isAllOnes()) {
44368 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44369
44370 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44371 for (int i = 0; i != NumElts; ++i)
44372 if (DemandedElts[i])
44373 DemandedMask[i] = i;
44374
44376 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44378 /*AllowVariableCrossLaneMask=*/true,
44379 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44380 TLO.DAG, SDLoc(Op), Subtarget);
44381 if (NewShuffle)
44382 return TLO.CombineTo(Op, NewShuffle);
44383 }
44384
44385 return false;
44386}
44387
// X86 override of TargetLowering::SimplifyDemandedBitsForTargetNode: for the
// X86-specific opcodes handled in the switch below, attempt to simplify Op
// given which result bits (OriginalDemandedBits) and vector elements
// (OriginalDemandedElts) are actually used. Learned bit information is
// reported through Known; rewrites are performed via TLO.CombineTo. Returns
// true if the DAG was changed.
// NOTE(review): this listing is missing upstream source line 44388 — the
// function header, presumably
// `bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(` — confirm
// against the original X86ISelLowering.cpp. Several other interior lines are
// also missing; each gap is flagged with a NOTE(review) where it occurs.
44389 SDValue Op, const APInt &OriginalDemandedBits,
44390 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44391 unsigned Depth) const {
44392 EVT VT = Op.getValueType();
44393 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44394 unsigned Opc = Op.getOpcode();
44395 switch(Opc) {
44396 case X86ISD::VTRUNC: {
// Vector truncate: widen the demanded bits to the source's (wider) scalar
// size and narrow the demanded elements to the source's element count.
44397 KnownBits KnownOp;
44398 SDValue Src = Op.getOperand(0);
44399 MVT SrcVT = Src.getSimpleValueType();
44400
44401 // Simplify the input, using demanded bit information.
44402 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44403 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44404 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44405 return true;
44406 break;
44407 }
44408 case X86ISD::PMULDQ:
44409 case X86ISD::PMULUDQ: {
44410 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44411 KnownBits KnownLHS, KnownRHS;
44412 SDValue LHS = Op.getOperand(0);
44413 SDValue RHS = Op.getOperand(1);
44414
44415 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44416 // FIXME: Can we bound this better?
44417 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44418 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44419 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44420
44421 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44422 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44423 DemandedMaskLHS = DemandedMask;
44424 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44425 DemandedMaskRHS = DemandedMask;
44426
44427 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44428 KnownLHS, TLO, Depth + 1))
44429 return true;
44430 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44431 KnownRHS, TLO, Depth + 1))
44432 return true;
44433
44434 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44435 KnownRHS = KnownRHS.trunc(32);
44436 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44437 KnownRHS.getConstant().isOne()) {
44438 SDLoc DL(Op);
44439 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44440 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44441 }
44442
44443 // Aggressively peek through ops to get at the demanded low bits.
// NOTE(review): lines 44444 and 44446 are missing from this listing —
// presumably the `SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(`
// and matching DemandedRHS declarations; confirm against the original file.
44445 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44447 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44448 if (DemandedLHS || DemandedRHS) {
44449 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44450 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44451 return TLO.CombineTo(
44452 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44453 }
44454 break;
44455 }
44456 case X86ISD::ANDNP: {
44457 KnownBits Known2;
44458 SDValue Op0 = Op.getOperand(0);
44459 SDValue Op1 = Op.getOperand(1);
44460
44461 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44462 Known, TLO, Depth + 1))
44463 return true;
44464
// Bits known zero in Op1 cannot survive the AND, so they need not be
// demanded from the (inverted) Op0 side.
44465 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44466 OriginalDemandedElts, Known2, TLO, Depth + 1))
44467 return true;
44468
44469 // If the RHS is a constant, see if we can simplify it.
44470 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44471 OriginalDemandedElts, TLO))
44472 return true;
44473
44474 // ANDNP = (~Op0 & Op1);
44475 Known.One &= Known2.Zero;
44476 Known.Zero |= Known2.One;
44477 break;
44478 }
44479 case X86ISD::VSHLI: {
// Vector logical shift-left by immediate (per-element, immediate in Op1).
44480 SDValue Op0 = Op.getOperand(0);
44481 SDValue Op1 = Op.getOperand(1);
44482
44483 unsigned ShAmt = Op1->getAsZExtVal();
44484 if (ShAmt >= BitWidth)
44485 break;
44486
44487 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44488
44489 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44490 // single shift. We can do this if the bottom bits (which are shifted
44491 // out) are never demanded.
44492 if (Op0.getOpcode() == X86ISD::VSRLI &&
44493 OriginalDemandedBits.countr_zero() >= ShAmt) {
44494 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44495 if (Shift2Amt < BitWidth) {
44496 int Diff = ShAmt - Shift2Amt;
44497 if (Diff == 0)
44498 return TLO.CombineTo(Op, Op0.getOperand(0));
44499
44500 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44501 SDValue NewShift = TLO.DAG.getNode(
44502 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44503 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44504 return TLO.CombineTo(Op, NewShift);
44505 }
44506 }
44507
44508 // If we are only demanding sign bits then we can use the shift source directly.
44509 unsigned NumSignBits =
44510 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44511 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44512 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44513 return TLO.CombineTo(Op, Op0);
44514
44515 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44516 TLO, Depth + 1))
44517 return true;
44518
44519 Known <<= ShAmt;
44520
44521 // Low bits known zero.
44522 Known.Zero.setLowBits(ShAmt);
44523
44524 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44525 // Attempt to avoid multi-use ops if we don't need anything from them.
44526 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44527 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44528 SDValue NewOp =
44529 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44530 return TLO.CombineTo(Op, NewOp);
44531 }
44532 }
44533 return false;
44534 }
44535 case X86ISD::VSRLI: {
// Vector logical shift-right by immediate.
44536 SDValue Op0 = Op.getOperand(0);
44537 SDValue Op1 = Op.getOperand(1);
44538
44539 unsigned ShAmt = Op1->getAsZExtVal();
44540 if (ShAmt >= BitWidth)
44541 break;
44542
44543 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44544
44545 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44546 TLO, Depth + 1))
44547 return true;
44548
44549 Known >>= ShAmt;
44550
44551 // High bits known zero.
44552 Known.Zero.setHighBits(ShAmt);
44553
44554 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44555 // Attempt to avoid multi-use ops if we don't need anything from them.
44556 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44557 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44558 SDValue NewOp =
44559 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44560 return TLO.CombineTo(Op, NewOp);
44561 }
44562 }
44563 return false;
44564 }
44565 case X86ISD::VSRAI: {
// Vector arithmetic shift-right by immediate (sign-extending).
44566 SDValue Op0 = Op.getOperand(0);
44567 SDValue Op1 = Op.getOperand(1);
44568
44569 unsigned ShAmt = Op1->getAsZExtVal();
44570 if (ShAmt >= BitWidth)
44571 break;
44572
44573 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44574
44575 // If we just want the sign bit then we don't need to shift it.
44576 if (OriginalDemandedBits.isSignMask())
44577 return TLO.CombineTo(Op, Op0);
44578
44579 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44580 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44581 SDValue Op00 = Op0.getOperand(0);
44582 unsigned NumSignBits =
44583 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44584 if (ShAmt < NumSignBits)
44585 return TLO.CombineTo(Op, Op00);
44586 }
44587
44588 // If any of the demanded bits are produced by the sign extension, we also
44589 // demand the input sign bit.
44590 if (OriginalDemandedBits.countl_zero() < ShAmt)
44591 DemandedMask.setSignBit();
44592
44593 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44594 TLO, Depth + 1))
44595 return true;
44596
44597 Known >>= ShAmt;
44598
44599 // If the input sign bit is known to be zero, or if none of the top bits
44600 // are demanded, turn this into an unsigned shift right.
44601 if (Known.Zero[BitWidth - ShAmt - 1] ||
44602 OriginalDemandedBits.countl_zero() >= ShAmt)
44603 return TLO.CombineTo(
44604 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44605
44606 // High bits are known one.
44607 if (Known.One[BitWidth - ShAmt - 1])
44608 Known.One.setHighBits(ShAmt);
44609
44610 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44611 // Attempt to avoid multi-use ops if we don't need anything from them.
44612 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44613 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44614 SDValue NewOp =
44615 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44616 return TLO.CombineTo(Op, NewOp);
44617 }
44618 }
44619 return false;
44620 }
44621 case X86ISD::BLENDI: {
// Immediate blend: each result element comes from LHS or RHS according to
// the blend mask, so the demanded elements split disjointly between them.
44622 SDValue LHS = Op.getOperand(0);
44623 SDValue RHS = Op.getOperand(1);
44624 APInt Mask = getBLENDIBlendMask(Op);
44625
44626 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44627 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44628 TLO, Depth + 1))
44629 return true;
44630
44631 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44632 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44633 TLO, Depth + 1))
44634 return true;
44635
44636 // Attempt to avoid multi-use ops if we don't need anything from them.
// NOTE(review): lines 44637 and 44639 are missing from this listing —
// presumably the `SDValue NewLHS = SimplifyMultipleUseDemandedBits(` and
// matching NewRHS declarations; confirm against the original file.
44638 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44640 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44641 if (NewLHS || NewRHS) {
44642 NewLHS = NewLHS ? NewLHS : LHS;
44643 NewRHS = NewRHS ? NewRHS : RHS;
44644 return TLO.CombineTo(Op,
44645 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44646 NewLHS, NewRHS, Op.getOperand(2)));
44647 }
44648 break;
44649 }
44650 case X86ISD::BLENDV: {
// Variable blend: selection is controlled solely by the sign bit of each
// Sel element, so only the sign bits of Sel are demanded.
44651 SDValue Sel = Op.getOperand(0);
44652 SDValue LHS = Op.getOperand(1);
44653 SDValue RHS = Op.getOperand(2);
44654
44655 APInt SignMask = APInt::getSignMask(BitWidth);
// NOTE(review): lines 44656, 44658 and 44660 are missing from this listing —
// presumably the `SDValue NewSel/NewLHS/NewRHS = SimplifyMultipleUse-
// DemandedBits(` declarations; confirm against the original file.
44657 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44659 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44661 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44662
44663 if (NewSel || NewLHS || NewRHS) {
44664 NewSel = NewSel ? NewSel : Sel;
44665 NewLHS = NewLHS ? NewLHS : LHS;
44666 NewRHS = NewRHS ? NewRHS : RHS;
44667 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44668 NewSel, NewLHS, NewRHS));
44669 }
44670 break;
44671 }
44672 case X86ISD::PEXTRB:
44673 case X86ISD::PEXTRW: {
// Extract byte/word element; the result is implicitly zero-extended to VT.
44674 SDValue Vec = Op.getOperand(0);
44675 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44676 MVT VecVT = Vec.getSimpleValueType();
44677 unsigned NumVecElts = VecVT.getVectorNumElements();
44678
44679 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44680 unsigned Idx = CIdx->getZExtValue();
44681 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44682
44683 // If we demand no bits from the vector then we must have demanded
44684 // bits from the implicit zext - simplify to zero.
44685 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44686 if (DemandedVecBits == 0)
44687 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44688
44689 APInt KnownUndef, KnownZero;
44690 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44691 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44692 KnownZero, TLO, Depth + 1))
44693 return true;
44694
44695 KnownBits KnownVec;
44696 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44697 KnownVec, TLO, Depth + 1))
44698 return true;
44699
// NOTE(review): line 44700 is missing from this listing — presumably
// `if (SDValue V = SimplifyMultipleUseDemandedBits(`; confirm against the
// original file.
44701 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44702 return TLO.CombineTo(
44703 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44704
44705 Known = KnownVec.zext(BitWidth);
44706 return false;
44707 }
44708 break;
44709 }
44710 case X86ISD::PINSRB:
44711 case X86ISD::PINSRW: {
// Insert scalar Scl into vector element Idx; other elements pass through.
44712 SDValue Vec = Op.getOperand(0);
44713 SDValue Scl = Op.getOperand(1);
44714 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44715 MVT VecVT = Vec.getSimpleValueType();
44716
44717 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44718 unsigned Idx = CIdx->getZExtValue();
// If the inserted lane itself is not demanded, the insert is a no-op.
44719 if (!OriginalDemandedElts[Idx])
44720 return TLO.CombineTo(Op, Vec);
44721
44722 KnownBits KnownVec;
44723 APInt DemandedVecElts(OriginalDemandedElts);
44724 DemandedVecElts.clearBit(Idx);
44725 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44726 KnownVec, TLO, Depth + 1))
44727 return true;
44728
44729 KnownBits KnownScl;
44730 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44731 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44732 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44733 return true;
44734
44735 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44736 Known = KnownVec.intersectWith(KnownScl);
44737 return false;
44738 }
44739 break;
44740 }
44741 case X86ISD::PACKSS:
44742 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44743 // sign bit then we can just ask for the source operands sign bit.
44744 // TODO - add known bits handling.
44745 if (OriginalDemandedBits.isSignMask()) {
44746 APInt DemandedLHS, DemandedRHS;
44747 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44748
44749 KnownBits KnownLHS, KnownRHS;
44750 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44751 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44752 KnownLHS, TLO, Depth + 1))
44753 return true;
44754 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44755 KnownRHS, TLO, Depth + 1))
44756 return true;
44757
44758 // Attempt to avoid multi-use ops if we don't need anything from them.
// NOTE(review): lines 44759 and 44761 are missing from this listing —
// presumably the `SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(`
// and matching DemandedOp1 declarations; confirm against the original file.
44760 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44762 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44763 if (DemandedOp0 || DemandedOp1) {
44764 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44765 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44766 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44767 }
44768 }
44769 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44770 break;
44771 case X86ISD::VBROADCAST: {
// Broadcast: only element 0 of the source is ever read.
44772 SDValue Src = Op.getOperand(0);
44773 MVT SrcVT = Src.getSimpleValueType();
44774 APInt DemandedElts = APInt::getOneBitSet(
44775 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44776 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44777 TLO, Depth + 1))
44778 return true;
44779 // If we don't need the upper bits, attempt to narrow the broadcast source.
44780 // Don't attempt this on AVX512 as it might affect broadcast folding.
44781 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44782 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44783 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44784 Src->hasOneUse()) {
44785 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44786 SDValue NewSrc =
44787 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44788 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44789 SDValue NewBcst =
44790 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44791 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44792 }
44793 break;
44794 }
44795 case X86ISD::PCMPGT:
44796 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44797 // iff we only need the sign bit then we can use R directly.
44798 if (OriginalDemandedBits.isSignMask() &&
44799 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44800 return TLO.CombineTo(Op, Op.getOperand(1));
44801 break;
44802 case X86ISD::MOVMSK: {
// MOVMSK packs one sign bit per source element into the low NumElts bits of
// the scalar result; all higher result bits are zero.
44803 SDValue Src = Op.getOperand(0);
44804 MVT SrcVT = Src.getSimpleValueType();
44805 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44806 unsigned NumElts = SrcVT.getVectorNumElements();
44807
44808 // If we don't need the sign bits at all just return zero.
44809 if (OriginalDemandedBits.countr_zero() >= NumElts)
44810 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44811
44812 // See if we only demand bits from the lower 128-bit vector.
44813 if (SrcVT.is256BitVector() &&
44814 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44815 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44816 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44817 }
44818
44819 // Only demand the vector elements of the sign bits we need.
44820 APInt KnownUndef, KnownZero;
44821 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44822 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44823 TLO, Depth + 1))
44824 return true;
44825
44826 Known.Zero = KnownZero.zext(BitWidth);
44827 Known.Zero.setHighBits(BitWidth - NumElts);
44828
44829 // MOVMSK only uses the MSB from each vector element.
44830 KnownBits KnownSrc;
44831 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44832 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44833 Depth + 1))
44834 return true;
44835
44836 if (KnownSrc.One[SrcBits - 1])
44837 Known.One.setLowBits(NumElts)
44838 else if (KnownSrc.Zero[SrcBits - 1])
44839 Known.Zero.setLowBits(NumElts);
44840
44841 // Attempt to avoid multi-use ops if we don't need anything from them.
// NOTE(review): line 44842 is missing from this listing — presumably
// `if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(`; confirm against
// the original file.
44843 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44844 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44845 return false;
44846 }
44847 case X86ISD::TESTP: {
44848 SDValue Op0 = Op.getOperand(0);
44849 SDValue Op1 = Op.getOperand(1);
44850 MVT OpVT = Op0.getSimpleValueType();
44851 assert((OpVT.getVectorElementType() == MVT::f32 ||
44852 OpVT.getVectorElementType() == MVT::f64) &&
44853 "Illegal vector type for X86ISD::TESTP");
44854
44855 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44856 KnownBits KnownSrc;
44857 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
// When both operands are the same node, treat them as single-use so the
// simplification is not blocked by the apparent extra use.
44858 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44859 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44860 AssumeSingleUse) ||
44861 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44862 AssumeSingleUse);
44863 }
44864 case X86ISD::CMOV: {
// Conditional move: a result bit is known only if it is known identically
// in both possible sources.
44865 KnownBits Known2;
44866 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44867 OriginalDemandedElts, Known2, TLO, Depth + 1))
44868 return true;
44869 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44870 OriginalDemandedElts, Known, TLO, Depth + 1))
44871 return true;
44872
44873 // Only known if known in both the LHS and RHS.
44874 Known = Known.intersectWith(Known2);
44875 return false;
44876 }
44877 case X86ISD::BEXTR:
44878 case X86ISD::BEXTRI: {
// Bit-field extract: control operand packs start bit (bits 7:0) and length
// (bits 15:8); only its bottom 16 bits matter.
44879 SDValue Op0 = Op.getOperand(0);
44880 SDValue Op1 = Op.getOperand(1);
44881
44882 // Only bottom 16-bits of the control bits are required.
44883 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44884 // NOTE: SimplifyDemandedBits won't do this for constants.
44885 uint64_t Val1 = Cst1->getZExtValue();
44886 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44887 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44888 SDLoc DL(Op);
44889 return TLO.CombineTo(
44890 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44891 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44892 }
44893
44894 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44895 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44896
44897 // If the length is 0, the result is 0.
44898 if (Length == 0) {
44899 Known.setAllZero();
44900 return false;
44901 }
44902
44903 if ((Shift + Length) <= BitWidth) {
44904 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44905 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44906 return true;
44907
44908 Known = Known.extractBits(Length, Shift);
44909 Known = Known.zextOrTrunc(BitWidth);
44910 return false;
44911 }
44912 } else {
44913 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44914 KnownBits Known1;
44915 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44916 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44917 return true;
44918
44919 // If the length is 0, replace with 0.
44920 KnownBits LengthBits = Known1.extractBits(8, 8);
44921 if (LengthBits.isZero())
44922 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44923 }
44924
44925 break;
44926 }
44927 case X86ISD::PDEP: {
44928 SDValue Op0 = Op.getOperand(0);
44929 SDValue Op1 = Op.getOperand(1);
44930
44931 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44932 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44933
44934 // If the demanded bits have leading zeroes, we don't demand those from the
44935 // mask.
44936 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44937 return true;
44938
44939 // The number of possible 1s in the mask determines the number of LSBs of
44940 // operand 0 used. Undemanded bits from the mask don't matter so filter
44941 // them before counting.
44942 KnownBits Known2;
44943 uint64_t Count = (~Known.Zero & LoMask).popcount();
44944 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44945 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44946 return true;
44947
44948 // Zeroes are retained from the mask, but not ones.
44949 Known.One.clearAllBits();
44950 // The result will have at least as many trailing zeros as the non-mask
44951 // operand since bits can only map to the same or higher bit position.
44952 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44953 return false;
44954 }
44955 case X86ISD::VPMADD52L:
44956 case X86ISD::VPMADD52H: {
44957 KnownBits KnownOp0, KnownOp1;
44958 SDValue Op0 = Op.getOperand(0);
44959 SDValue Op1 = Op.getOperand(1);
44960 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
44961 // operand 2).
44962 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
44963 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
44964 TLO, Depth + 1))
44965 return true;
44966
44967 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
44968 TLO, Depth + 1))
44969 return true;
44970 // TODO: Compute the known bits for VPMADD52L/VPMADD52H.
44971 break;
44972 }
44973 }
44974
// Fall back to the target-independent handling for anything not simplified
// above.
// NOTE(review): line 44975 is missing from this listing — presumably
// `return TargetLowering::SimplifyDemandedBitsForTargetNode(`; confirm
// against the original file.
44976 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44977}
44978
44980 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44981 SelectionDAG &DAG, unsigned Depth) const {
// Try to return an existing SDValue that already supplies the demanded
// bits/elements of Op, so the caller can bypass this node entirely.
44982 int NumElts = DemandedElts.getBitWidth();
44983 unsigned Opc = Op.getOpcode();
44984 EVT VT = Op.getValueType();
44985
44986 switch (Opc) {
44987 case X86ISD::PINSRB:
44988 case X86ISD::PINSRW: {
44989 // If we don't demand the inserted element, return the base vector.
44990 SDValue Vec = Op.getOperand(0);
44991 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44992 MVT VecVT = Vec.getSimpleValueType();
44993 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44994 !DemandedElts[CIdx->getZExtValue()])
44995 return Vec;
44996 break;
44997 }
44998 case X86ISD::VSHLI: {
44999 // If we are only demanding sign bits then we can use the shift source
45000 // directly.
45001 SDValue Op0 = Op.getOperand(0);
45002 unsigned ShAmt = Op.getConstantOperandVal(1);
45003 unsigned BitWidth = DemandedBits.getBitWidth();
45004 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45005 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45006 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45007 return Op0;
45008 break;
45009 }
45010 case X86ISD::VSRAI:
45011 // iff we only need the sign bit then we can use the source directly.
45012 // TODO: generalize where we only demand extended signbits.
45013 if (DemandedBits.isSignMask())
45014 return Op.getOperand(0);
45015 break;
45016 case X86ISD::PCMPGT:
45017 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45018 // iff we only need the sign bit then we can use R directly.
45019 if (DemandedBits.isSignMask() &&
45020 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45021 return Op.getOperand(1);
45022 break;
45023 case X86ISD::BLENDV: {
45024 // BLENDV: Cond (MSB) ? LHS : RHS
45025 SDValue Cond = Op.getOperand(0);
45026 SDValue LHS = Op.getOperand(1);
45027 SDValue RHS = Op.getOperand(2);
45028
// If the sign bit of every demanded condition lane is known, the blend
// statically picks one side.
45029 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45030 if (CondKnown.isNegative())
45031 return LHS;
45032 if (CondKnown.isNonNegative())
45033 return RHS;
45034 break;
45035 }
45036 case X86ISD::ANDNP: {
45037 // ANDNP = (~LHS & RHS);
45038 SDValue LHS = Op.getOperand(0);
45039 SDValue RHS = Op.getOperand(1);
45040
45041 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45042 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45043
45044 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45045 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45046 // this context, so return RHS.
45047 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45048 return RHS;
45049 break;
45050 }
45051 }
45052
// Otherwise, try to resolve Op as a target shuffle: if every demanded
// element comes "inline" (same lane) from a single same-sized input, that
// input can replace the shuffle.
45053 APInt ShuffleUndef, ShuffleZero;
45054 SmallVector<int, 16> ShuffleMask;
45056 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45057 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45058 // If all the demanded elts are from one operand and are inline,
45059 // then we can use the operand directly.
45060 int NumOps = ShuffleOps.size();
45061 if (ShuffleMask.size() == (unsigned)NumElts &&
45063 return VT.getSizeInBits() == V.getValueSizeInBits();
45064 })) {
45065
45066 if (DemandedElts.isSubsetOf(ShuffleUndef))
45067 return DAG.getUNDEF(VT);
45068 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45069 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45070
45071 // Bitmask that indicates which ops have only been accessed 'inline'.
45072 APInt IdentityOp = APInt::getAllOnes(NumOps);
45073 for (int i = 0; i != NumElts; ++i) {
45074 int M = ShuffleMask[i];
45075 if (!DemandedElts[i] || ShuffleUndef[i])
45076 continue;
45077 int OpIdx = M / NumElts;
45078 int EltIdx = M % NumElts;
45079 if (M < 0 || EltIdx != i) {
45080 IdentityOp.clearAllBits();
45081 break;
45082 }
45083 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45084 if (IdentityOp == 0)
45085 break;
45086 }
45087 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45088 "Multiple identity shuffles detected");
45089
45090 if (IdentityOp != 0)
45091 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45092 }
45093 }
45094
45096 Op, DemandedBits, DemandedElts, DAG, Depth);
45097}
45098
45100 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45101 bool PoisonOnly, unsigned Depth) const {
// Return true if the demanded elements of this target node are known to be
// neither undef nor poison.
45102 unsigned NumElts = DemandedElts.getBitWidth();
45103
45104 switch (Op.getOpcode()) {
45106 case X86ISD::Wrapper:
45107 case X86ISD::WrapperRIP:
45108 return true;
45109 case X86ISD::BLENDI:
45110 case X86ISD::PSHUFD:
45111 case X86ISD::UNPCKL:
45112 case X86ISD::UNPCKH:
45113 case X86ISD::VPERMILPI:
45114 case X86ISD::VPERMV3: {
45117 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
// Accumulate, per shuffle input, which source elements the demanded result
// lanes read. An undef mask entry defeats the guarantee outright.
45118 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45119 APInt::getZero(NumElts));
45120 for (auto M : enumerate(Mask)) {
45121 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45122 continue;
45123 if (M.value() == SM_SentinelUndef)
45124 return false;
45125 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45126 "Shuffle mask index out of range");
45127 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45128 }
// Every input that contributes demanded elements must itself be clean.
45129 for (auto Op : enumerate(Ops))
45130 if (!DemandedSrcElts[Op.index()].isZero() &&
45132 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45133 return false;
45134 return true;
45135 }
45136 break;
45137 }
45138 }
45140 Op, DemandedElts, DAG, PoisonOnly, Depth);
45141}
45142
45144 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45145 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
// Conservatively report whether this node can itself introduce undef/poison;
// every opcode/intrinsic listed below is known not to (returns false).
45146
45147 switch (Op.getOpcode()) {
45148 // SSE bit logic.
45149 case X86ISD::FAND:
45150 case X86ISD::FOR:
45151 case X86ISD::FXOR:
45152 case X86ISD::FANDN:
45153 case X86ISD::ANDNP:
45154 case X86ISD::VPTERNLOG:
45155 return false;
45156 // SSE vector insert/extracts use modulo indices.
45157 case X86ISD::PINSRB:
45158 case X86ISD::PINSRW:
45159 case X86ISD::PEXTRB:
45160 case X86ISD::PEXTRW:
45161 return false;
45162 // SSE vector multiplies are either inbounds or saturate.
45163 case X86ISD::VPMADDUBSW:
45164 case X86ISD::VPMADDWD:
45165 return false;
45166 // SSE vector shifts handle out of bounds shift amounts.
45167 case X86ISD::VSHLI:
45168 case X86ISD::VSRLI:
45169 case X86ISD::VSRAI:
45170 return false;
45171 // SSE blends.
45172 case X86ISD::BLENDI:
45173 case X86ISD::BLENDV:
45174 return false;
45175 // SSE target shuffles.
45176 case X86ISD::PSHUFD:
45177 case X86ISD::UNPCKL:
45178 case X86ISD::UNPCKH:
45179 case X86ISD::VPERMILPI:
45180 case X86ISD::VPERMV3:
45181 return false;
45182 // SSE comparisons handle all icmp/fcmp cases.
45183 // TODO: Add CMPM/MM with test coverage.
45184 case X86ISD::CMPP:
45185 case X86ISD::PCMPEQ:
45186 case X86ISD::PCMPGT:
45187 return false;
45188 // SSE signbit extraction.
45189 case X86ISD::MOVMSK:
45190 return false;
45191 // GFNI instructions.
45194 case X86ISD::GF2P8MULB:
45195 return false;
// Intrinsic numbers known not to create undef/poison.
45197 switch (Op->getConstantOperandVal(0)) {
45198 case Intrinsic::x86_sse2_pmadd_wd:
45199 case Intrinsic::x86_avx2_pmadd_wd:
45200 case Intrinsic::x86_avx512_pmaddw_d_512:
45201 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45202 case Intrinsic::x86_avx2_pmadd_ub_sw:
45203 case Intrinsic::x86_avx512_pmaddubs_w_512:
45204 return false;
45205 case Intrinsic::x86_avx512_vpermi2var_d_128:
45206 case Intrinsic::x86_avx512_vpermi2var_d_256:
45207 case Intrinsic::x86_avx512_vpermi2var_d_512:
45208 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45209 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45210 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45211 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45212 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45213 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45214 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45215 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45216 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45217 case Intrinsic::x86_avx512_vpermi2var_q_128:
45218 case Intrinsic::x86_avx512_vpermi2var_q_256:
45219 case Intrinsic::x86_avx512_vpermi2var_q_512:
45220 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45221 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45222 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45223 return false;
45224 }
45225 }
45227 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45228}
45229
45231 const APInt &DemandedElts,
45232 APInt &UndefElts,
45233 const SelectionDAG &DAG,
45234 unsigned Depth) const {
// Report whether Op is a splat across the demanded elements, filling
// UndefElts with any lanes known undef.
45235 unsigned NumElts = DemandedElts.getBitWidth();
45236 unsigned Opc = Op.getOpcode();
45237
45238 switch (Opc) {
// Broadcasts replicate a single scalar, so all lanes are defined splats.
45239 case X86ISD::VBROADCAST:
45241 UndefElts = APInt::getZero(NumElts);
45242 return true;
45243 }
45244
45245 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45246 DAG, Depth);
45247}
45248
45249// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45250// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45251 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45252 bool AllowTruncate, unsigned Depth) {
// Returns true if the vector feeding Src (looking through the node kinds
// below) is exactly Size bits wide.
45253 // Limit recursion.
45255 return false;
45256 switch (Src.getOpcode()) {
45257 case ISD::TRUNCATE:
45258 if (!AllowTruncate)
45259 return false;
45260 [[fallthrough]];
45261 case ISD::SETCC:
// The compare's input vector determines the pre-i1 source width.
45262 return Src.getOperand(0).getValueSizeInBits() == Size;
45263 case ISD::FREEZE:
45264 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45265 Depth + 1);
45266 case ISD::AND:
45267 case ISD::XOR:
45268 case ISD::OR:
// Both logic operands must trace back to the same source width.
45269 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45270 Depth + 1) &&
45271 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45272 Depth + 1);
45273 case ISD::SELECT:
45274 case ISD::VSELECT:
// Only look through selects with a boolean (i1) condition; check both arms.
45275 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45276 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45277 Depth + 1) &&
45278 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45279 Depth + 1);
45280 case ISD::BUILD_VECTOR:
// All-zeros/all-ones constants are width-agnostic and always acceptable.
45281 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45282 ISD::isBuildVectorAllOnes(Src.getNode());
45283 }
45284 return false;
45285}
45286
45287// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45288static unsigned getAltBitOpcode(unsigned Opcode) {
45289 switch(Opcode) {
45290 // clang-format off
45291 case ISD::AND: return X86ISD::FAND;
45292 case ISD::OR: return X86ISD::FOR;
45293 case ISD::XOR: return X86ISD::FXOR;
45294 case X86ISD::ANDNP: return X86ISD::FANDN;
45295 // clang-format on
45296 }
45297 llvm_unreachable("Unknown bitwise opcode");
45298}
45299
45300// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45302 const SDLoc &DL) {
// Try to express Src (a v4i1 mask) as a v4f32 value whose element sign bits
// carry the mask, so MOVMSKPS can extract it on SSE1-only targets. Returns
// an empty SDValue if the pattern isn't recognized.
45303 EVT SrcVT = Src.getValueType();
45304 if (SrcVT != MVT::v4i1)
45305 return SDValue();
45306
45307 switch (Src.getOpcode()) {
45308 case ISD::SETCC:
// (setlt X, 0) only tests sign bits, which survive an i32 -> f32 bitcast.
45309 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45310 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45311 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45312 SDValue Op0 = Src.getOperand(0);
45313 if (ISD::isNormalLoad(Op0.getNode()))
45314 return DAG.getBitcast(MVT::v4f32, Op0);
45315 if (Op0.getOpcode() == ISD::BITCAST &&
45316 Op0.getOperand(0).getValueType() == MVT::v4f32)
45317 return Op0.getOperand(0);
45318 }
45319 break;
45320 case ISD::AND:
45321 case ISD::XOR:
45322 case ISD::OR: {
// Rebuild integer logic as the matching FP logic op (FAND/FXOR/FOR) if both
// operands can be adjusted.
45323 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45324 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45325 if (Op0 && Op1)
45326 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45327 Op1);
45328 break;
45329 }
45330 }
45331 return SDValue();
45332}
45333
45334// Helper to push sign extension of vXi1 SETCC result through bitops.
45336 SDValue Src, const SDLoc &DL) {
45337 switch (Src.getOpcode()) {
// Leaf-like nodes: sign-extend the value directly to SExtVT.
45338 case ISD::SETCC:
45339 case ISD::FREEZE:
45340 case ISD::TRUNCATE:
45341 case ISD::BUILD_VECTOR:
45342 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
// Bitwise ops: push the extension into both operands and redo the op wide.
45343 case ISD::AND:
45344 case ISD::XOR:
45345 case ISD::OR:
45346 return DAG.getNode(
45347 Src.getOpcode(), DL, SExtVT,
45348 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45349 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
// Selects: keep the i1 condition, extend only the selected values.
45350 case ISD::SELECT:
45351 case ISD::VSELECT:
45352 return DAG.getSelect(
45353 DL, SExtVT, Src.getOperand(0),
45354 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45355 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45356 }
45357 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45358}
45359
45360// Try to match patterns such as
45361// (i16 bitcast (v16i1 x))
45362// ->
45363// (i16 movmsk (16i8 sext (v16i1 x)))
45364// before the illegal vector is scalarized on subtargets that don't have legal
45365// vxi1 types.
45367 const SDLoc &DL,
45368 const X86Subtarget &Subtarget) {
// Only handle bitcasts whose source is a simple vXi1 vector.
45369 EVT SrcVT = Src.getValueType();
45370 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45371 return SDValue();
45372
45373 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45374 // legalization destroys the v4i32 type.
45375 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45376 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45377 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45378 DAG.getBitcast(MVT::v4f32, V));
45379 return DAG.getZExtOrTrunc(V, DL, VT);
45380 }
45381 }
45382
45383 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45384 // movmskb even with avx512. This will be better than truncating to vXi1 and
45385 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45386 // vpcmpeqb/vpcmpgtb.
45387 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45388 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45389 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45390 Src.getOperand(0).getValueType() == MVT::v64i8);
45391
45392 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45393 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45394 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45395 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45396 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45397 EVT CmpVT = Src.getOperand(0).getValueType();
45398 EVT EltVT = CmpVT.getVectorElementType();
45399 if (CmpVT.getSizeInBits() <= 256 &&
45400 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45401 PreferMovMsk = true;
45402 }
45403
45404 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45405 // MOVMSK is supported in SSE2 or later.
45406 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45407 return SDValue();
45408
45409 // If the upper ops of a concatenation are undef, then try to bitcast the
45410 // lower op and extend.
45411 SmallVector<SDValue, 4> SubSrcOps;
45412 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45413 SubSrcOps.size() >= 2) {
45414 SDValue LowerOp = SubSrcOps[0];
45415 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45416 if (LowerOp.getOpcode() == ISD::SETCC &&
45417 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45418 EVT SubVT = VT.getIntegerVT(
45419 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements())
45420 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45421 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45422 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45423 }
45424 }
45425 }
45426
45427 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45428 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45429 // v8i16 and v16i16.
45430 // For these two cases, we can shuffle the upper element bytes to a
45431 // consecutive sequence at the start of the vector and treat the results as
45432 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45433 // for v16i16 this is not the case, because the shuffle is expensive, so we
45434 // avoid sign-extending to this type entirely.
45435 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45436 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45437 MVT SExtVT;
45438 bool PropagateSExt = false;
45439 switch (SrcVT.getSimpleVT().SimpleTy) {
45440 default:
45441 return SDValue();
45442 case MVT::v2i1:
45443 SExtVT = MVT::v2i64;
45444 break;
45445 case MVT::v4i1:
45446 SExtVT = MVT::v4i32;
45447 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45448 // sign-extend to a 256-bit operation to avoid truncation.
45449 if (Subtarget.hasAVX() &&
45450 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45451 SExtVT = MVT::v4i64;
45452 PropagateSExt = true;
45453 }
45454 break;
45455 case MVT::v8i1:
45456 SExtVT = MVT::v8i16;
45457 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45458 // sign-extend to a 256-bit operation to match the compare.
45459 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45460 // 256-bit because the shuffle is cheaper than sign extending the result of
45461 // the compare.
45462 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45463 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45464 SExtVT = MVT::v8i32;
45465 PropagateSExt = true;
45466 }
45467 break;
45468 case MVT::v16i1:
45469 SExtVT = MVT::v16i8;
45470 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45471 // it is not profitable to sign-extend to 256-bit because this will
45472 // require an extra cross-lane shuffle which is more expensive than
45473 // truncating the result of the compare to 128-bits.
45474 break;
45475 case MVT::v32i1:
45476 SExtVT = MVT::v32i8;
45477 break;
45478 case MVT::v64i1:
45479 // If we have AVX512F, but not AVX512BW and the input is truncated from
45480 // v64i8 checked earlier. Then split the input and make two pmovmskbs.
45481 if (Subtarget.hasAVX512()) {
45482 if (Subtarget.hasBWI())
45483 return SDValue();
45484 SExtVT = MVT::v64i8;
45485 break;
45486 }
45487 // Split if this is a <64 x i8> comparison result.
45488 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45489 SExtVT = MVT::v64i8;
45490 break;
45491 }
45492 return SDValue();
45493 };
45494
// Materialize the i1 vector as a sign-extended vector, then collect the
// element sign bits into a scalar with (P)MOVMSK.
45495 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45496 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45497
45498 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45499 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45500 } else {
45501 if (SExtVT == MVT::v8i16) {
45502 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45503 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45504 }
45505 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45506 }
45507
// Adjust the mask width to the bitcast's destination width.
45508 EVT IntVT =
45510 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45511 return DAG.getBitcast(VT, V);
45512}
45513
45514// Convert a vXi1 constant build vector to the same width scalar integer.
45516 EVT SrcVT = Op.getValueType();
45517 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45518 "Expected a vXi1 vector");
45520 "Expected a constant build vector");
// Pack the lanes into an integer: bit Idx is set for each lane whose
// constant is odd (bit 0 set); undef lanes are treated as zero.
45522 APInt Imm(SrcVT.getVectorNumElements(), 0);
45523 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45524 SDValue In = Op.getOperand(Idx);
45525 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45526 Imm.setBit(Idx);
45527 }
45528 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45529 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45530}
45531
45534 const X86Subtarget &Subtarget) {
// Move a bitwise logic op across a vXi1-mask <-> scalar bitcast so the logic
// executes in the destination domain, letting one of the bitcasts fold away.
45535 using namespace SDPatternMatch;
45536 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45537
45538 if (!DCI.isBeforeLegalizeOps())
45539 return SDValue();
45540
45541 // Only do this if we have k-registers.
45542 if (!Subtarget.hasAVX512())
45543 return SDValue();
45544
45545 EVT DstVT = N->getValueType(0);
45546 SDValue Op = N->getOperand(0);
45547 EVT SrcVT = Op.getValueType();
45548
45549 // Make sure we have a bitcast between mask registers and a scalar type.
45550 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45551 DstVT.isScalarInteger()) &&
45552 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45553 SrcVT.isScalarInteger()))
45554 return SDValue();
45555
45556 SDValue LHS, RHS;
45557
45558 // Look for logic ops.
45559 if (!sd_match(Op, m_OneUse(m_BitwiseLogic(m_Value(LHS), m_Value(RHS)))))
45560 return SDValue();
45561
45562 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45563 // least one of the getBitcast() will fold away).
45564 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45565 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45566 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45567 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45568
45569 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45570 // Most of these have to move a constant from the scalar domain anyway.
45573 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45574 DAG.getBitcast(DstVT, LHS), RHS);
45575 }
45576
45577 return SDValue();
45578}
45579
45581 const X86Subtarget &Subtarget) {
// Build an x86mmx value from a BUILD_VECTOR: create each element with
// MMX_MOVW2D/MOVDQ2Q, then combine elements with PUNPCKL (and PSHUFW for
// splats on SSE-capable targets).
45582 SDLoc DL(BV);
45583 unsigned NumElts = BV->getNumOperands();
45584 SDValue Splat = BV->getSplatValue();
45585
45586 // Build MMX element from integer GPR or SSE float values.
45587 auto CreateMMXElement = [&](SDValue V) {
45588 if (V.isUndef())
45589 return DAG.getUNDEF(MVT::x86mmx);
45590 if (V.getValueType().isFloatingPoint()) {
45591 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45592 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45593 V = DAG.getBitcast(MVT::v2i64, V);
45594 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45595 }
45596 V = DAG.getBitcast(MVT::i32, V);
45597 } else {
45598 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45599 }
45600 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45601 };
45602
45603 // Convert build vector ops to MMX data in the bottom elements.
45605
45606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45607
45608 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45609 if (Splat) {
45610 if (Splat.isUndef())
45611 return DAG.getUNDEF(MVT::x86mmx);
45612
45613 Splat = CreateMMXElement(Splat);
45614
45615 if (Subtarget.hasSSE1()) {
45616 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45617 if (NumElts == 8)
45618 Splat = DAG.getNode(
45619 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45620 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45621 TLI.getPointerTy(DAG.getDataLayout())),
45622 Splat, Splat);
45623
45624 // Use PSHUFW to repeat 16-bit elements.
45625 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45626 return DAG.getNode(
45627 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45628 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45629 TLI.getPointerTy(DAG.getDataLayout())),
45630 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45631 }
// No SSE1: fall back to the generic PUNPCKL tree below with repeated splats.
45632 Ops.append(NumElts, Splat);
45633 } else {
45634 for (unsigned i = 0; i != NumElts; ++i)
45635 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45636 }
45637
45638 // Use tree of PUNPCKLs to build up general MMX vector.
45639 while (Ops.size() > 1) {
45640 unsigned NumOps = Ops.size();
// Pick the interleave width that matches the current element granularity.
45641 unsigned IntrinOp =
45642 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45643 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45644 : Intrinsic::x86_mmx_punpcklbw));
45645 SDValue Intrin = DAG.getTargetConstant(
45646 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45647 for (unsigned i = 0; i != NumOps; i += 2)
45648 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45649 Ops[i], Ops[i + 1]);
45650 Ops.resize(NumOps / 2);
45651 }
45652
45653 return Ops[0];
45654}
45655
45656// Recursive function that attempts to find if a bool vector node was originally
45657// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45658// integer. If so, replace the scalar ops with bool vector equivalents back down
45659// the chain.
45661 SelectionDAG &DAG,
45662 const X86Subtarget &Subtarget,
45663 unsigned Depth = 0) {
45665 return SDValue(); // Limit search depth.
45666
45667 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45668 unsigned Opc = V.getOpcode();
45669 switch (Opc) {
45670 case ISD::BITCAST: {
45671 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45672 SDValue Src = V.getOperand(0);
45673 EVT SrcVT = Src.getValueType();
45674 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45675 return DAG.getBitcast(VT, Src);
45676 break;
45677 }
45678 case ISD::Constant: {
// All-zeros / all-ones scalar constants map directly to the equivalent
// bool-vector constants.
45679 auto *C = cast<ConstantSDNode>(V);
45680 if (C->isZero())
45681 return DAG.getConstant(0, DL, VT);
45682 if (C->isAllOnes())
45683 return DAG.getAllOnesConstant(DL, VT);
45684 break;
45685 }
45686 case ISD::TRUNCATE: {
45687 // If we find a suitable source, a truncated scalar becomes a subvector.
45688 SDValue Src = V.getOperand(0);
45689 EVT NewSrcVT =
45690 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45691 if (TLI.isTypeLegal(NewSrcVT))
45692 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45693 Subtarget, Depth + 1))
45694 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45695 DAG.getVectorIdxConstant(0, DL));
45696 break;
45697 }
45698 case ISD::ANY_EXTEND:
45699 case ISD::ZERO_EXTEND: {
45700 // If we find a suitable source, an extended scalar becomes a subvector.
45701 SDValue Src = V.getOperand(0);
45702 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45703 Src.getScalarValueSizeInBits());
45704 if (TLI.isTypeLegal(NewSrcVT))
45705 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45706 Subtarget, Depth + 1))
// Zero-extend requires the inserted-into vector to be zero; any-extend
// can use undef for the upper lanes.
45707 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45708 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45709 : DAG.getConstant(0, DL, VT),
45710 N0, DAG.getVectorIdxConstant(0, DL));
45711 break;
45712 }
45713 case ISD::OR:
45714 case ISD::XOR: {
45715 // If we find suitable sources, we can just move the op to the vector
45716 // domain.
45717 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45718 Subtarget, Depth + 1))
45719 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45720 Subtarget, Depth + 1))
45721 return DAG.getNode(Opc, DL, VT, N0, N1);
45722 break;
45723 }
45724 case ISD::SHL: {
45725 // If we find a suitable source, a SHL becomes a KSHIFTL.
45726 SDValue Src0 = V.getOperand(0);
// KSHIFT on these mask widths needs DQI (v8i1) or BWI (v32i1/v64i1).
45727 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45728 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45729 break;
45730
45731 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45732 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45733 Depth + 1))
45734 return DAG.getNode(
45735 X86ISD::KSHIFTL, DL, VT, N0,
45736 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45737 break;
45738 }
45739 }
45740
45741 // Does the inner bitcast already exist?
45742 if (Depth > 0)
45743 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45744 return SDValue(Alt, 0);
45745
45746 return SDValue();
45747}
45748
45751 const X86Subtarget &Subtarget) {
45752 SDValue N0 = N->getOperand(0);
45753 EVT VT = N->getValueType(0);
45754 EVT SrcVT = N0.getValueType();
45755 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45756
45757 // Try to match patterns such as
45758 // (i16 bitcast (v16i1 x))
45759 // ->
45760 // (i16 movmsk (16i8 sext (v16i1 x)))
45761 // before the setcc result is scalarized on subtargets that don't have legal
45762 // vxi1 types.
45763 if (DCI.isBeforeLegalize()) {
45764 SDLoc dl(N);
45765 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45766 return V;
45767
45768 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45769 // type, widen both sides to avoid a trip through memory.
45770 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45771 Subtarget.hasAVX512()) {
45772 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45773 N0 = DAG.getBitcast(MVT::v8i1, N0);
45774 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45775 DAG.getVectorIdxConstant(0, dl));
45776 }
45777
45778 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45779 // type, widen both sides to avoid a trip through memory.
45780 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45781 Subtarget.hasAVX512()) {
45782 // Use zeros for the widening if we already have some zeroes. This can
45783 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45784 // stream of this.
45785 // FIXME: It might make sense to detect a concat_vectors with a mix of
45786 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45787 // a separate combine. What we can't do is canonicalize the operands of
45788 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45789 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45790 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45791 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45792 SrcVT = LastOp.getValueType();
45793 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45794 SmallVector<SDValue, 4> Ops(N0->ops());
45795 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45796 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45797 N0 = DAG.getBitcast(MVT::i8, N0);
45798 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45799 }
45800 }
45801
45802 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45803 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45804 Ops[0] = N0;
45805 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45806 N0 = DAG.getBitcast(MVT::i8, N0);
45807 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45808 }
45809 } else if (DCI.isAfterLegalizeDAG()) {
45810 // If we're bitcasting from iX to vXi1, see if the integer originally
45811 // began as a vXi1 and whether we can remove the bitcast entirely.
45812 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45813 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45814 if (SDValue V =
45815 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45816 return V;
45817 }
45818 }
45819
45820 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45821 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45822 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45823 // we can help with known bits propagation from the vXi1 domain to the
45824 // scalar domain.
45825 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45826 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45827 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45829 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45830 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45831
45832 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45833 // and the vbroadcast_load are both integer or both fp. In some cases this
45834 // will remove the bitcast entirely.
45835 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45836 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45837 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45838 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45839 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45840 // Don't swap i8/i16 since don't have fp types that size.
45841 if (MemSize >= 32) {
45842 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45843 : MVT::getIntegerVT(MemSize);
45844 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45845 : MVT::getIntegerVT(SrcVTSize);
45846 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45847
45848 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45849 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45850 SDValue ResNode =
45852 MemVT, BCast->getMemOperand());
45853 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45854 return DAG.getBitcast(VT, ResNode);
45855 }
45856 }
45857
45858 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45859 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45860 SDValue Src = peekThroughTruncates(N0);
45861 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45862 Src.getOperand(0).getValueSizeInBits() == 128 &&
45863 isNullConstant(Src.getOperand(1))) {
45864 SDLoc DL(N);
45865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45866 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45867 DAG.getVectorIdxConstant(0, DL));
45868 }
45869 }
45870
45871 // Since MMX types are special and don't usually play with other vector types,
45872 // it's better to handle them early to be sure we emit efficient code by
45873 // avoiding store-load conversions.
45874 if (VT == MVT::x86mmx) {
45875 // Detect MMX constant vectors.
45876 APInt UndefElts;
45877 SmallVector<APInt, 1> EltBits;
45878 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45879 /*AllowWholeUndefs*/ true,
45880 /*AllowPartialUndefs*/ true)) {
45881 SDLoc DL(N0);
45882 // Handle zero-extension of i32 with MOVD.
45883 if (EltBits[0].countl_zero() >= 32)
45884 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45885 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45886 // Else, bitcast to a double.
45887 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45888 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45889 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45890 }
45891
45892 // Detect bitcasts to x86mmx low word.
45893 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45894 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45895 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45896 bool LowUndef = true, AllUndefOrZero = true;
45897 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45898 SDValue Op = N0.getOperand(i);
45899 LowUndef &= Op.isUndef() || (i >= e/2);
45900 AllUndefOrZero &= isNullConstantOrUndef(Op);
45901 }
45902 if (AllUndefOrZero) {
45903 SDValue N00 = N0.getOperand(0);
45904 SDLoc dl(N00);
45905 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45906 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45907 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45908 }
45909 }
45910
45911 // Detect bitcasts of 64-bit build vectors and convert to a
45912 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45913 // lowest element.
45914 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45915 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45916 SrcVT == MVT::v8i8))
45917 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45918
45919 // Detect bitcasts between element or subvector extraction to x86mmx.
45920 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45922 isNullConstant(N0.getOperand(1))) {
45923 SDValue N00 = N0.getOperand(0);
45924 if (N00.getValueType().is128BitVector())
45925 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45926 DAG.getBitcast(MVT::v2i64, N00));
45927 }
45928
45929 // Detect bitcasts from FP_TO_SINT to x86mmx.
45930 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45931 SDLoc DL(N0);
45932 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45933 DAG.getUNDEF(MVT::v2i32));
45934 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45935 DAG.getBitcast(MVT::v2i64, Res));
45936 }
45937 }
45938
45939 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45940 // most of these to scalar anyway.
45941 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45942 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45944 return combinevXi1ConstantToInteger(N0, DAG);
45945 }
45946
45947 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
45948 VT.getVectorElementType() == MVT::i1) {
45949 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
45950 if (C->isAllOnes())
45951 return DAG.getConstant(1, SDLoc(N0), VT);
45952 if (C->isZero())
45953 return DAG.getConstant(0, SDLoc(N0), VT);
45954 }
45955 }
45956
45957 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45958 // Turn it into a sign bit compare that produces a k-register. This avoids
45959 // a trip through a GPR.
45960 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45961 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45963 unsigned NumElts = VT.getVectorNumElements();
45964 SDValue Src = N0;
45965
45966 // Peek through truncate.
45967 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
45968 Src = N0.getOperand(0);
45969
45970 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
45971 SDValue MovmskIn = Src.getOperand(0);
45972 MVT MovmskVT = MovmskIn.getSimpleValueType();
45973 unsigned MovMskElts = MovmskVT.getVectorNumElements();
45974
45975 // We allow extra bits of the movmsk to be used since they are known zero.
45976 // We can't convert a VPMOVMSKB without avx512bw.
45977 if (MovMskElts <= NumElts &&
45978 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
45979 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
45980 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
45981 SDLoc dl(N);
45982 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
45983 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
45984 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
45985 if (EVT(CmpVT) == VT)
45986 return Cmp;
45987
45988 // Pad with zeroes up to original VT to replace the zeroes that were
45989 // being used from the MOVMSK.
45990 unsigned NumConcats = NumElts / MovMskElts;
45991 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
45992 Ops[0] = Cmp;
45993 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
45994 }
45995 }
45996 }
45997
45998 // Try to remove bitcasts from input and output of mask arithmetic to
45999 // remove GPR<->K-register crossings.
46000 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46001 return V;
46002
46003 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46004 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46005 SrcVT.getVectorNumElements() == 1)
46006 return N0.getOperand(1);
46007
46008 // Convert a bitcasted integer logic operation that has one bitcasted
46009 // floating-point operand into a floating-point logic operation. This may
46010 // create a load of a constant, but that is cheaper than materializing the
46011 // constant in an integer register and transferring it to an SSE register or
46012 // transferring the SSE operand to integer register and back.
46013 unsigned FPOpcode;
46014 switch (N0.getOpcode()) {
46015 // clang-format off
46016 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46017 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46018 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46019 default: return SDValue();
46020 // clang-format on
46021 }
46022
46023 // Check if we have a bitcast from another integer type as well.
46024 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46025 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46026 (Subtarget.hasFP16() && VT == MVT::f16) ||
46027 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46028 TLI.isTypeLegal(VT))))
46029 return SDValue();
46030
46031 SDValue LogicOp0 = N0.getOperand(0);
46032 SDValue LogicOp1 = N0.getOperand(1);
46033 SDLoc DL0(N0);
46034
46035 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46036 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46037 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46038 LogicOp0.getOperand(0).getValueType() == VT &&
46039 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46040 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46041 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46042 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46043 }
46044 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46045 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46046 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46047 LogicOp1.getOperand(0).getValueType() == VT &&
46048 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46049 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46050 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46051 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46052 }
46053
46054 return SDValue();
46055}
46056
46057// (mul (zext a), (sext, b))
46058static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46059 SDValue &Op1) {
46060 Op0 = Mul.getOperand(0);
46061 Op1 = Mul.getOperand(1);
46062
46063 // The operand1 should be signed extend
46064 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46065 std::swap(Op0, Op1);
46066
46067 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46068 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46069 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46070 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46071 return true;
46072
46073 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46074 return (BV && BV->isConstant());
46075 };
46076
46077 // (dpbusd (zext a), (sext, b)). Since the first operand should be unsigned
46078 // value, we need to check Op0 is zero extended value. Op1 should be signed
46079 // value, so we just check the signed bits.
46080 if ((IsFreeTruncation(Op0) &&
46081 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46082 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46083 return true;
46084
46085 return false;
46086}
46087
// Build a VPDPBUSD dot-product of LHS (treated as unsigned i8) and RHS
// (treated as signed i8), accumulating into a zero vector. LogBias is set to
// the number of reduction stages the dot-product itself already performs
// (each i32 lane sums a group of 4 i8 products).
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
                              unsigned &LogBias, const SDLoc &DL,
                              const X86Subtarget &Subtarget) {
  // Extend or truncate to MVT::i8 first.
  MVT Vi8VT =
      MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
  LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
  RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);

  // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
  // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
  // The src A, B element type is i8, but the dst C element type is i32.
  // When we calculate the reduce stage, we use src vector type vXi8 for it
  // so we need logbias 2 to avoid extra 2 stages.
  LogBias = 2;

  // Widen to at least one full 128-bit register; without VLX the VNNI
  // instructions are only available on 512-bit ZMM registers.
  unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
  if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
    RegSize = std::max(512u, RegSize);

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
  unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
  Ops[0] = LHS;
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = RHS;
  SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the DotProduct, split as 256/512 bits for
  // AVXVNNI/AVX512VNNI.
  auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                      ArrayRef<SDValue> Ops) {
    MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
    return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
  };
  MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
  SDValue Zero = DAG.getConstant(0, DL, DpVT);

  return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
                          DpBuilder, false);
}
46131
// Create a PSADBW given two sources representable as zexts of vXi8.
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
                            const SDLoc &DL, const X86Subtarget &Subtarget) {
  // Find the appropriate width for the PSADBW.
  EVT DstVT = N0.getValueType();
  EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
                               DstVT.getVectorElementCount());
  unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());

  // Widen the vXi8 vectors, padding with zero vector elements.
  // The padding lanes contribute |0 - 0| = 0 to the sums, so they are benign.
  unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
  Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
  auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
    MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
    return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
  };
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
                          PSADBWBuilder);
}
46160
// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // Bail without SSE41.
  if (!Subtarget.hasSSE41())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
    return SDValue();

  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
  ISD::NodeType BinOp;
  SDValue Src = DAG.matchBinOpReduction(
      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
  if (!Src)
    return SDValue();

  // The reduction source must have the extracted element type and be a whole
  // number of 128-bit registers wide.
  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getScalarType();
  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
    return SDValue();

  SDLoc DL(Extract);
  SDValue MinPos = Src;

  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
  while (SrcVT.getSizeInBits() > 128) {
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
    SrcVT = Lo.getValueType();
    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
  }
  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
         "Unexpected value type");

  // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
  // to flip the value accordingly.
  SDValue Mask;
  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
  if (BinOp == ISD::SMAX)
    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
  else if (BinOp == ISD::SMIN)
    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
  else if (BinOp == ISD::UMAX)
    Mask = DAG.getAllOnesConstant(DL, SrcVT);

  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
  // shuffling each upper element down and insert zeros. This means that the
  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
  // ready for the PHMINPOS.
  if (ExtractVT == MVT::i8) {
    SDValue Upper = DAG.getVectorShuffle(
        SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
  }

  // Perform the PHMINPOS on a v8i16 vector,
  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
  MinPos = DAG.getBitcast(SrcVT, MinPos);

  // Undo the earlier value flip to recover the real min/max result.
  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
                     DAG.getVectorIdxConstant(0, DL));
}
46235
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  // Bail without SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  unsigned BitWidth = ExtractVT.getSizeInBits();
  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
      ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
    return SDValue();

  // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
  // XOR (parity) is only attempted for i1 results.
  ISD::NodeType BinOp;
  SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
  if (!Match && ExtractVT == MVT::i1)
    Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
  if (!Match)
    return SDValue();

  // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
  // which we can't support here for now.
  if (Match.getScalarValueSizeInBits() != BitWidth)
    return SDValue();

  SDValue Movmsk;
  SDLoc DL(Extract);
  EVT MatchVT = Match.getValueType();
  unsigned NumElts = MatchVT.getVectorNumElements();
  unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  LLVMContext &Ctx = *DAG.getContext();

  if (ExtractVT == MVT::i1) {
    // Special case for (pre-legalization) vXi1 reductions.
    if (NumElts > 64 || !isPowerOf2_32(NumElts))
      return SDValue();
    if (Match.getOpcode() == ISD::SETCC) {
      ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
      if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
          (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
        // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
        // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
        X86::CondCode X86CC;
        SDValue LHS = DAG.getFreeze(Match.getOperand(0));
        SDValue RHS = DAG.getFreeze(Match.getOperand(1));
        APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
        if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
                                            DAG, X86CC))
          return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
                             getSETCC(X86CC, V, DL, DAG));
      }
    }
    if (TLI.isTypeLegal(MatchVT)) {
      // If this is a legal AVX512 predicate type then we can just bitcast.
      EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
      Movmsk = DAG.getBitcast(MovmskVT, Match);
    } else {
      // Use combineBitcastvxi1 to create the MOVMSK.
      while (NumElts > MaxElts) {
        SDValue Lo, Hi;
        std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
        Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
        NumElts /= 2;
      }
      EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
      Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
    }
    if (!Movmsk)
      return SDValue();
    Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
  } else {
    // FIXME: Better handling of k-registers or 512-bit vectors?
    unsigned MatchSizeInBits = Match.getValueSizeInBits();
    if (!(MatchSizeInBits == 128 ||
          (MatchSizeInBits == 256 && Subtarget.hasAVX())))
      return SDValue();

    // Make sure this isn't a vector of 1 element. The perf win from using
    // MOVMSK diminishes with less elements in the reduction, but it is
    // generally better to get the comparison over to the GPRs as soon as
    // possible to reduce the number of vector ops.
    if (Match.getValueType().getVectorNumElements() < 2)
      return SDValue();

    // Check that we are extracting a reduction of all sign bits.
    if (DAG.ComputeNumSignBits(Match) != BitWidth)
      return SDValue();

    if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
      Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
      MatchSizeInBits = Match.getValueSizeInBits();
    }

    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
    MVT MaskSrcVT;
    if (64 == BitWidth || 32 == BitWidth)
      MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
                                   MatchSizeInBits / BitWidth);
    else
      MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

    SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
    Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
    NumElts = MaskSrcVT.getVectorNumElements();
  }
  assert((NumElts <= 32 || NumElts == 64) &&
         "Not expecting more than 64 elements");

  MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
  if (BinOp == ISD::XOR) {
    // parity -> (PARITY(MOVMSK X))
    SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
    return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
  }

  SDValue CmpC;
  ISD::CondCode CondCode;
  if (BinOp == ISD::OR) {
    // any_of -> MOVMSK != 0
    CmpC = DAG.getConstant(0, DL, CmpVT);
    CondCode = ISD::CondCode::SETNE;
  } else {
    // all_of -> MOVMSK == ((1 << NumElts) - 1)
    CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
                           DL, CmpVT);
    CondCode = ISD::CondCode::SETEQ;
  }

  // The setcc produces an i8 of 0/1, so extend that to the result width and
  // negate to get the final 0/-1 mask value.
  EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
  SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
  SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
  return DAG.getNegative(Zext, DL, ExtractVT);
}
46375
// Attempt to replace an i32 extraction of a (zext i8) x (sext i8) multiply
// reduction with a VPDPBUSD dot-product (AVX512-VNNI / AVX-VNNI).
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  // Verify the type we're extracting is i32, as the output element type of
  // vpdpbusd is i32.
  if (ExtractVT != MVT::i32)
    return SDValue();

  EVT VT = Extract->getOperand(0).getValueType();
  if (!isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  // Match shuffle + add pyramid.
  ISD::NodeType BinOp;
  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});

  // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
  // done by vpdpbusd compute a signed 16-bit product that will be sign extended
  // before adding into the accumulator.
  // TODO:
  // We also need to verify that the multiply has at least 2x the number of bits
  // of the input. We shouldn't match
  // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
  // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
  //   Root = Root.getOperand(0);

  // If there was a match, we want Root to be a mul.
  if (!Root || Root.getOpcode() != ISD::MUL)
    return SDValue();

  // Check whether we have an extend and mul pattern
  SDValue LHS, RHS;
  if (!detectExtMul(DAG, Root, LHS, RHS))
    return SDValue();

  // Create the dot product instruction.
  SDLoc DL(Extract);
  unsigned StageBias;
  SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);

  // If the original vector was wider than 4 elements, sum over the results
  // in the DP vector.
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  EVT DpVT = DP.getValueType();

  if (Stages > StageBias) {
    unsigned DpElems = DpVT.getVectorNumElements();

    // Shuffle the upper halves down and accumulate, halving the active
    // element count each iteration.
    for (unsigned i = Stages - StageBias; i > 0; --i) {
      SmallVector<int, 16> Mask(DpElems, -1);
      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
        Mask[j] = MaskEnd + j;

      SDValue Shuffle =
          DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
      DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
    }
  }

  // Return the lowest ExtractSizeInBits bits.
  EVT ResVT =
      EVT::getVectorVT(*DAG.getContext(), ExtractVT,
                       DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
  DP = DAG.getBitcast(ResVT, DP);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
                     Extract->getOperand(1));
}
46446
// Attempt to replace an extraction of a vXi8 absolute-difference reduction
// with a PSADBW (sum of absolute differences) node.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  using namespace SDPatternMatch;

  // PSADBW is only supported on SSE2 and up.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
      ExtractVT != MVT::i64)
    return SDValue();

  EVT VT = Extract->getOperand(0).getValueType();
  if (!isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  // Match shuffle + add pyramid.
  ISD::NodeType BinOp;
  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
  if (!Root)
    return SDValue();

  // The operand is expected to be zero extended from i8.
  // In order to convert to i64 and above, additional any/zero/sign
  // extend is expected.
  // The zero extend from 32 bit has no mathematical effect on the result.
  // Also the sign extend is basically zero extend
  // (extends the sign bit which is zero).
  // So it is correct to skip the sign/zero extend instruction.
  if (Root.getOpcode() == ISD::SIGN_EXTEND ||
      Root.getOpcode() == ISD::ZERO_EXTEND ||
      Root.getOpcode() == ISD::ANY_EXTEND)
    Root = Root.getOperand(0);

  // Check whether we have an vXi8 abdu pattern.
  // Accepted forms: vXi8 ABDU, vXi8 umax-umin, or abs(sub(zext,zext)).
  // TODO: Just match ISD::ABDU once the DAG is topological sorted.
  SDValue Src0, Src1;
  if (!sd_match(
          Root,
          m_AnyOf(
              m_SpecificVectorElementVT(
                  MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
              m_SpecificVectorElementVT(
                  MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
                                 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
              m_Abs(
                  m_Sub(m_AllOf(m_Value(Src0),
                                m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
                        m_AllOf(m_Value(Src1),
                                m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
    return SDValue();

  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);

  // If the original vector was wider than 8 elements, sum over the results
  // in the SAD vector.
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  EVT SadVT = SAD.getValueType();
  if (Stages > 3) {
    unsigned SadElems = SadVT.getVectorNumElements();

    // Shuffle the upper halves down and accumulate until one lane remains.
    for(unsigned i = Stages - 3; i > 0; --i) {
      SmallVector<int, 16> Mask(SadElems, -1);
      for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
        Mask[j] = MaskEnd + j;

      SDValue Shuffle =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
    }
  }

  unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
  // Return the lowest ExtractSizeInBits bits.
  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
                               SadVT.getSizeInBits() / ExtractSizeInBits);
  SAD = DAG.getBitcast(ResVT, SAD);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
                     Extract->getOperand(1));
}
46530
// If this extract is from a loaded vector value and will be used as an
// integer, that requires a potentially expensive XMM -> GPR transfer.
// Additionally, if we can convert to a scalar integer load, that will likely
// be folded into a subsequent integer op.
// Note: SrcVec might not have a VecVT type, but it must be the same size.
// Note: Unlike the related fold for this in DAGCombiner, this is not limited
//       to a single-use of the loaded vector. For the reasons above, we
//       expect this to be profitable even if it creates an extra load.
static SDValue
combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
                             const SDLoc &dl, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
         "Only EXTRACT_VECTOR_ELT supported so far");

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);

  // If any user wants the value back in a vector register, the scalar load
  // would just force a GPR -> XMM round-trip, so don't fold in that case.
  bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
    return Use->getOpcode() == ISD::STORE ||
           Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
           Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
  });

  auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
  if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
      VecVT.getVectorElementType() == VT &&
      VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
      DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
    // Build a scalar load of just the extracted element's bytes.
    SDValue NewPtr = TLI.getVectorElementPointer(
        DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
    unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
    MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
    Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
    SDValue Load =
        DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
                    LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
    // Keep the memory ordering of the original vector load.
    DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
    return Load;
  }

  return SDValue();
}
46574
46575// Attempt to peek through a target shuffle and extract the scalar from the
46576// source.
46579 const X86Subtarget &Subtarget) {
46580 if (DCI.isBeforeLegalizeOps())
46581 return SDValue();
46582
46583 SDLoc dl(N);
46584 SDValue Src = N->getOperand(0);
46585 SDValue Idx = N->getOperand(1);
46586
46587 EVT VT = N->getValueType(0);
46588 EVT SrcVT = Src.getValueType();
46589 EVT SrcSVT = SrcVT.getVectorElementType();
46590 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46591 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46592
46593 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46594 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46595 return SDValue();
46596
46597 const APInt &IdxC = N->getConstantOperandAPInt(1);
46598 if (IdxC.uge(NumSrcElts))
46599 return SDValue();
46600
46601 SDValue SrcBC = peekThroughBitcasts(Src);
46602
46603 // Handle extract(bitcast(broadcast(scalar_value))).
46604 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46605 SDValue SrcOp = SrcBC.getOperand(0);
46606 EVT SrcOpVT = SrcOp.getValueType();
46607 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46608 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46609 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46610 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46611 // TODO support non-zero offsets.
46612 if (Offset == 0) {
46613 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46614 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46615 return SrcOp;
46616 }
46617 }
46618 }
46619
46620 // If we're extracting a single element from a broadcast load and there are
46621 // no other users, just create a single load.
46623 SrcBC.hasOneUse()) {
46624 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46625 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46626 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46627 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46628 SDValue Load =
46629 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46630 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46631 MemIntr->getMemOperand()->getFlags());
46632 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46633 return Load;
46634 }
46635 }
46636
46637 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46638 // TODO: Move to DAGCombine?
46639 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46640 SrcBC.getValueType().isInteger() &&
46641 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46642 SrcBC.getScalarValueSizeInBits() ==
46643 SrcBC.getOperand(0).getValueSizeInBits()) {
46644 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46645 if (IdxC.ult(Scale)) {
46646 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46647 SDValue Scl = SrcBC.getOperand(0);
46648 EVT SclVT = Scl.getValueType();
46649 if (Offset) {
46650 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46651 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46652 }
46653 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46654 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46655 return Scl;
46656 }
46657 }
46658
46659 // Handle extract(truncate(x)) for 0'th index.
46660 // TODO: Treat this as a faux shuffle?
46661 // TODO: When can we use this for general indices?
46662 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46663 (SrcVT.getSizeInBits() % 128) == 0) {
46664 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46665 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46666 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46667 Idx);
46668 }
46669
46670 // We can only legally extract other elements from 128-bit vectors and in
46671 // certain circumstances, depending on SSE-level.
46672 // TODO: Investigate float/double extraction if it will be just stored.
46673 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46674 unsigned Idx) {
46675 EVT VecSVT = VecVT.getScalarType();
46676 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46677 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46678 VecSVT == MVT::i64)) {
46679 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46680 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46681 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46682 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46683 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46684 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46685 Idx &= (NumEltsPerLane - 1);
46686 }
46687 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46688 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46689 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46690 DAG.getBitcast(VecVT, Vec),
46691 DAG.getVectorIdxConstant(Idx, dl));
46692 }
46693 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46694 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46695 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46696 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46697 DAG.getTargetConstant(Idx, dl, MVT::i8));
46698 }
46699 return SDValue();
46700 };
46701
46702 // Resolve the target shuffle inputs and mask.
46705 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46706 return SDValue();
46707
46708 // Shuffle inputs must be the same size as the result.
46709 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46710 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46711 }))
46712 return SDValue();
46713
46714 // Attempt to narrow/widen the shuffle mask to the correct size.
46715 if (Mask.size() != NumSrcElts) {
46716 if ((NumSrcElts % Mask.size()) == 0) {
46717 SmallVector<int, 16> ScaledMask;
46718 int Scale = NumSrcElts / Mask.size();
46719 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46720 Mask = std::move(ScaledMask);
46721 } else if ((Mask.size() % NumSrcElts) == 0) {
46722 // Simplify Mask based on demanded element.
46723 int ExtractIdx = (int)IdxC.getZExtValue();
46724 int Scale = Mask.size() / NumSrcElts;
46725 int Lo = Scale * ExtractIdx;
46726 int Hi = Scale * (ExtractIdx + 1);
46727 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46728 if (i < Lo || Hi <= i)
46729 Mask[i] = SM_SentinelUndef;
46730
46731 SmallVector<int, 16> WidenedMask;
46732 while (Mask.size() > NumSrcElts &&
46733 canWidenShuffleElements(Mask, WidenedMask))
46734 Mask = std::move(WidenedMask);
46735 }
46736 }
46737
46738 // If narrowing/widening failed, see if we can extract+zero-extend.
46739 int ExtractIdx;
46740 EVT ExtractVT;
46741 if (Mask.size() == NumSrcElts) {
46742 ExtractIdx = Mask[IdxC.getZExtValue()];
46743 ExtractVT = SrcVT;
46744 } else {
46745 unsigned Scale = Mask.size() / NumSrcElts;
46746 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46747 return SDValue();
46748 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46749 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46750 return SDValue();
46751 ExtractIdx = Mask[ScaledIdx];
46752 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46753 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46754 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46755 "Failed to widen vector type");
46756 }
46757
46758 // If the shuffle source element is undef/zero then we can just accept it.
46759 if (ExtractIdx == SM_SentinelUndef)
46760 return DAG.getUNDEF(VT);
46761
46762 if (ExtractIdx == SM_SentinelZero)
46763 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46764 : DAG.getConstant(0, dl, VT);
46765
46766 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46767 ExtractIdx = ExtractIdx % Mask.size();
46768 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46769 return DAG.getZExtOrTrunc(V, dl, VT);
46770
46771 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46773 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46774 return V;
46775
46776 return SDValue();
46777}
46778
46779/// Extracting a scalar FP value from vector element 0 is free, so extract each
46780/// operand first, then perform the math as a scalar op.
46782 const X86Subtarget &Subtarget,
46784 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46785 SDValue Vec = ExtElt->getOperand(0);
46786 SDValue Index = ExtElt->getOperand(1);
46787 EVT VT = ExtElt->getValueType(0);
46788 EVT VecVT = Vec.getValueType();
46789
46790 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46791 // non-zero element because the shuffle+scalar op will be cheaper?
46792 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46793 return SDValue();
46794
46795 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46796 // extract, the condition code), so deal with those as a special-case.
46797 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46798 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46799 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46800 return SDValue();
46801
46802 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46803 SDLoc DL(ExtElt);
46804 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46805 Vec.getOperand(0), Index);
46806 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46807 Vec.getOperand(1), Index);
46808 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46809 }
46810
46811 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46812 VT != MVT::f64)
46813 return SDValue();
46814
46815 // Vector FP selects don't fit the pattern of FP math ops (because the
46816 // condition has a different type and we have to change the opcode), so deal
46817 // with those here.
46818 // FIXME: This is restricted to pre type legalization. If we loosen this we
46819 // need to convert vector bool to a scalar bool.
46820 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46821 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46822 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46823 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46824 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46825 SDLoc DL(ExtElt);
46828 Vec.getOperand(0), Index);
46829 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46830 Vec.getOperand(1), Index);
46831 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46832 Vec.getOperand(2), Index);
46833 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46834 }
46835
46836 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46837 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46838 // missed load folding and fma+fneg combining.
46839 switch (Vec.getOpcode()) {
46840 case ISD::FMA: // Begin 3 operands
46841 case ISD::FMAD:
46842 case ISD::FADD: // Begin 2 operands
46843 case ISD::FSUB:
46844 case ISD::FMUL:
46845 case ISD::FDIV:
46846 case ISD::FREM:
46847 case ISD::FCOPYSIGN:
46848 case ISD::FMINNUM:
46849 case ISD::FMAXNUM:
46850 case ISD::FMINNUM_IEEE:
46851 case ISD::FMAXNUM_IEEE:
46852 case ISD::FMAXIMUM:
46853 case ISD::FMINIMUM:
46854 case ISD::FMAXIMUMNUM:
46855 case ISD::FMINIMUMNUM:
46856 case X86ISD::FMAX:
46857 case X86ISD::FMIN:
46858 case ISD::FABS: // Begin 1 operand
46859 case ISD::FSQRT:
46860 case ISD::FRINT:
46861 case ISD::FCEIL:
46862 case ISD::FTRUNC:
46863 case ISD::FNEARBYINT:
46864 case ISD::FROUNDEVEN:
46865 case ISD::FROUND:
46866 case ISD::FFLOOR:
46867 case X86ISD::FRCP:
46868 case X86ISD::FRSQRT: {
46869 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46870 SDLoc DL(ExtElt);
46872 for (SDValue Op : Vec->ops())
46873 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46874 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46875 }
46876 default:
46877 return SDValue();
46878 }
46879 llvm_unreachable("All opcodes should return within switch");
46880}
46881
46882/// Try to convert a vector reduction sequence composed of binops and shuffles
46883/// into horizontal ops.
46885 const X86Subtarget &Subtarget) {
46886 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46887
46888 // We need at least SSE2 to anything here.
46889 if (!Subtarget.hasSSE2())
46890 return SDValue();
46891
46893 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46894 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46895 if (!Rdx)
46896 return SDValue();
46897
46898 SDValue Index = ExtElt->getOperand(1);
46899 assert(isNullConstant(Index) &&
46900 "Reduction doesn't end in an extract from index 0");
46901
46902 EVT VT = ExtElt->getValueType(0);
46903 EVT VecVT = Rdx.getValueType();
46904 if (VecVT.getScalarType() != VT)
46905 return SDValue();
46906
46907 SDLoc DL(ExtElt);
46908 unsigned NumElts = VecVT.getVectorNumElements();
46909 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46910
46911 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46912 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46913 if (V.getValueType() == MVT::v4i8) {
46914 if (ZeroExtend && Subtarget.hasSSE41()) {
46915 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46916 DAG.getConstant(0, DL, MVT::v4i32),
46917 DAG.getBitcast(MVT::i32, V),
46918 DAG.getVectorIdxConstant(0, DL));
46919 return DAG.getBitcast(MVT::v16i8, V);
46920 }
46921 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46922 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46923 : DAG.getUNDEF(MVT::v4i8));
46924 }
46925 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46926 DAG.getUNDEF(MVT::v8i8));
46927 };
46928
46929 // vXi8 mul reduction - promote to vXi16 mul reduction.
46930 if (Opc == ISD::MUL) {
46931 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46932 return SDValue();
46933 if (VecVT.getSizeInBits() >= 128) {
46934 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
46935 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46936 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46937 Lo = DAG.getBitcast(WideVT, Lo);
46938 Hi = DAG.getBitcast(WideVT, Hi);
46939 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
46940 while (Rdx.getValueSizeInBits() > 128) {
46941 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46942 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
46943 }
46944 } else {
46945 Rdx = WidenToV16I8(Rdx, false);
46946 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
46947 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
46948 }
46949 if (NumElts >= 8)
46950 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46951 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46952 {4, 5, 6, 7, -1, -1, -1, -1}));
46953 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46954 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46955 {2, 3, -1, -1, -1, -1, -1, -1}));
46956 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46957 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46958 {1, -1, -1, -1, -1, -1, -1, -1}));
46959 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46960 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46961 }
46962
46963 // vXi8 add reduction - sub 128-bit vector.
46964 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
46965 Rdx = WidenToV16I8(Rdx, true);
46966 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46967 DAG.getConstant(0, DL, MVT::v16i8));
46968 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46969 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46970 }
46971
46972 // Must be a >=128-bit vector with pow2 elements.
46973 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
46974 return SDValue();
46975
46976 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
46977 if (VT == MVT::i8) {
46978 while (Rdx.getValueSizeInBits() > 128) {
46979 SDValue Lo, Hi;
46980 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46981 VecVT = Lo.getValueType();
46982 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
46983 }
46984 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
46985
46987 MVT::v16i8, DL, Rdx, Rdx,
46988 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
46989 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
46990 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46991 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
46992 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46993 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46994 }
46995
46996 // See if we can use vXi8 PSADBW add reduction for larger zext types.
46997 // If the source vector values are 0-255, then we can use PSADBW to
46998 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
46999 // TODO: See if its worth avoiding vXi16/i32 truncations?
47000 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47001 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47002 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47003 Subtarget.hasAVX512())) {
47004 if (Rdx.getValueType() == MVT::v8i16) {
47005 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47006 DAG.getUNDEF(MVT::v8i16));
47007 } else {
47008 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47009 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47010 if (ByteVT.getSizeInBits() < 128)
47011 Rdx = WidenToV16I8(Rdx, true);
47012 }
47013
47014 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47015 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47016 ArrayRef<SDValue> Ops) {
47017 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47018 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47019 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47020 };
47021 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47022 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47023
47024 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47025 while (Rdx.getValueSizeInBits() > 128) {
47026 SDValue Lo, Hi;
47027 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47028 VecVT = Lo.getValueType();
47029 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47030 }
47031 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47032
47033 if (NumElts > 8) {
47034 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47035 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47036 }
47037
47038 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47039 Rdx = DAG.getBitcast(VecVT, Rdx);
47040 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47041 }
47042
47043 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
47044 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47045 return SDValue();
47046
47047 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47048
47049 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47050 // across the whole vector, so we need an extract + hop preliminary stage.
47051 // This is the only step where the operands of the hop are not the same value.
47052 // TODO: We could extend this to handle 512-bit or even longer vectors.
47053 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47054 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47055 unsigned NumElts = VecVT.getVectorNumElements();
47056 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47057 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47058 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47059 VecVT = Rdx.getValueType();
47060 }
47061 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47062 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47063 return SDValue();
47064
47065 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47066 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47067 for (unsigned i = 0; i != ReductionSteps; ++i)
47068 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47069
47070 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47071}
47072
47073/// Detect vector gather/scatter index generation and convert it from being a
47074/// bunch of shuffles and extracts into a somewhat faster sequence.
47075/// For i686, the best sequence is apparently storing the value and loading
47076/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47079 const X86Subtarget &Subtarget) {
47080 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47081 return NewOp;
47082
47083 SDValue InputVector = N->getOperand(0);
47084 SDValue EltIdx = N->getOperand(1);
47085 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47086
47087 EVT SrcVT = InputVector.getValueType();
47088 EVT VT = N->getValueType(0);
47089 SDLoc dl(InputVector);
47090 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47091 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47092 unsigned NumEltBits = VT.getScalarSizeInBits();
47093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47094
47095 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47096 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47097
47098 // Integer Constant Folding.
47099 if (CIdx && VT.isInteger()) {
47100 APInt UndefVecElts;
47101 SmallVector<APInt, 16> EltBits;
47102 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47103 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47104 EltBits, /*AllowWholeUndefs*/ true,
47105 /*AllowPartialUndefs*/ false)) {
47106 uint64_t Idx = CIdx->getZExtValue();
47107 if (UndefVecElts[Idx])
47108 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47109 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47110 }
47111
47112 // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
47113 // Improves lowering of bool masks on rust which splits them into byte array.
47114 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47115 SDValue Src = peekThroughBitcasts(InputVector);
47116 if (Src.getValueType().getScalarType() == MVT::i1 &&
47117 TLI.isTypeLegal(Src.getValueType())) {
47118 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47119 SDValue Sub = DAG.getNode(
47120 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47121 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47122 return DAG.getBitcast(VT, Sub);
47123 }
47124 }
47125 }
47126
47127 if (IsPextr) {
47128 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47129 DCI))
47130 return SDValue(N, 0);
47131
47132 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47133 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47134 InputVector.getOpcode() == X86ISD::PINSRW) &&
47135 InputVector.getOperand(2) == EltIdx) {
47136 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47137 "Vector type mismatch");
47138 SDValue Scl = InputVector.getOperand(1);
47139 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47140 return DAG.getZExtOrTrunc(Scl, dl, VT);
47141 }
47142
47143 // TODO - Remove this once we can handle the implicit zero-extension of
47144 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47145 // combineBasicSADPattern.
47146 return SDValue();
47147 }
47148
47149 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47150 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47151 InputVector.getOpcode() == ISD::BITCAST &&
47152 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47153 isNullConstant(EltIdx) && InputVector.hasOneUse())
47154 return DAG.getBitcast(VT, InputVector);
47155
47156 // Detect mmx to i32 conversion through a v2i32 elt extract.
47157 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47158 InputVector.getOpcode() == ISD::BITCAST &&
47159 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47160 isNullConstant(EltIdx) && InputVector.hasOneUse())
47161 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47162 InputVector.getOperand(0));
47163
47164 // Check whether this extract is the root of a sum of absolute differences
47165 // pattern. This has to be done here because we really want it to happen
47166 // pre-legalization,
47167 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47168 return SAD;
47169
47170 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47171 return VPDPBUSD;
47172
47173 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47174 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47175 return Cmp;
47176
47177 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47178 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47179 return MinMax;
47180
47181 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47182 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47183 return V;
47184
47185 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47186 return V;
47187
47188 if (CIdx)
47190 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47191 dl, DAG, DCI))
47192 return V;
47193
47194 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47195 // and then testing the relevant element.
47196 //
47197 // Note that we only combine extracts on the *same* result number, i.e.
47198 // t0 = merge_values a0, a1, a2, a3
47199 // i1 = extract_vector_elt t0, Constant:i64<2>
47200 // i1 = extract_vector_elt t0, Constant:i64<3>
47201 // but not
47202 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47203 // since the latter would need its own MOVMSK.
47204 if (SrcVT.getScalarType() == MVT::i1) {
47205 bool IsVar = !CIdx;
47206 SmallVector<SDNode *, 16> BoolExtracts;
47207 unsigned ResNo = InputVector.getResNo();
47208 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47209 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47210 Use->getOperand(0).getResNo() == ResNo &&
47211 Use->getValueType(0) == MVT::i1) {
47212 BoolExtracts.push_back(Use);
47213 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47214 return true;
47215 }
47216 return false;
47217 };
47218 // TODO: Can we drop the oneuse check for constant extracts?
47219 if (all_of(InputVector->users(), IsBoolExtract) &&
47220 (IsVar || BoolExtracts.size() > 1)) {
47221 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47222 if (SDValue BC =
47223 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47224 for (SDNode *Use : BoolExtracts) {
47225 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47226 // Mask = 1 << MaskIdx
47227 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47228 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47229 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47230 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47231 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47232 DCI.CombineTo(Use, Res);
47233 }
47234 return SDValue(N, 0);
47235 }
47236 }
47237 }
47238
47239 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47240 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47241 SDValue TruncSrc = InputVector.getOperand(0);
47242 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47243 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47244 SDValue NewExt =
47245 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47246 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47247 }
47248 }
47249
47250 return SDValue();
47251}
47252
47253// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47254// This is more or less the reverse of combineBitcastvxi1.
47256 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47257 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47258 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47259 Opcode != ISD::ANY_EXTEND)
47260 return SDValue();
47261 if (!DCI.isBeforeLegalizeOps())
47262 return SDValue();
47263 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47264 return SDValue();
47265
47266 EVT SVT = VT.getScalarType();
47267 EVT InSVT = N0.getValueType().getScalarType();
47268 unsigned EltSizeInBits = SVT.getSizeInBits();
47269
47270 // Input type must be extending a bool vector (bit-casted from a scalar
47271 // integer) to legal integer types.
47272 if (!VT.isVector())
47273 return SDValue();
47274 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47275 return SDValue();
47276 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47277 return SDValue();
47278
47279 SDValue N00 = N0.getOperand(0);
47280 EVT SclVT = N00.getValueType();
47281 if (!SclVT.isScalarInteger())
47282 return SDValue();
47283
47284 SDValue Vec;
47285 SmallVector<int> ShuffleMask;
47286 unsigned NumElts = VT.getVectorNumElements();
47287 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47288
47289 // Broadcast the scalar integer to the vector elements.
47290 if (NumElts > EltSizeInBits) {
47291 // If the scalar integer is greater than the vector element size, then we
47292 // must split it down into sub-sections for broadcasting. For example:
47293 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47294 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47295 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47296 unsigned Scale = NumElts / EltSizeInBits;
47297 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47298 bool UseBroadcast = Subtarget.hasInt256() &&
47299 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47300 Vec = UseBroadcast
47301 ? DAG.getSplat(BroadcastVT, DL, N00)
47302 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47303 Vec = DAG.getBitcast(VT, Vec);
47304
47305 for (unsigned i = 0; i != Scale; ++i) {
47306 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47307 ShuffleMask.append(EltSizeInBits, i + Offset);
47308 }
47309 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47310 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47311 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47312 // If we have register broadcast instructions, use the scalar size as the
47313 // element type for the shuffle. Then cast to the wider element type. The
47314 // widened bits won't be used, and this might allow the use of a broadcast
47315 // load.
47316 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47317 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47318 (NumElts * EltSizeInBits) / NumElts);
47319 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47320 } else {
47321 // For smaller scalar integers, we can simply any-extend it to the vector
47322 // element size (we don't care about the upper bits) and broadcast it to all
47323 // elements.
47324 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47325 }
47326
47327 // Now, mask the relevant bit in each element.
47329 for (unsigned i = 0; i != NumElts; ++i) {
47330 int BitIdx = (i % EltSizeInBits);
47331 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47332 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47333 }
47334 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47335 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47336
47337 // Compare against the bitmask and extend the result.
47338 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47339 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47340 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47341
47342 // For SEXT, this is now done, otherwise shift the result down for
47343 // zero-extension.
47344 if (Opcode == ISD::SIGN_EXTEND)
47345 return Vec;
47346 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47347 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47348}
47349
47350/// If both arms of a vector select are concatenated vectors, split the select,
47351/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47352/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47353/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47355 const X86Subtarget &Subtarget) {
47356 unsigned Opcode = N->getOpcode();
47357 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47358 return SDValue();
47359
47360 // TODO: Split 512-bit vectors too?
47361 EVT VT = N->getValueType(0);
47362 if (!VT.is256BitVector())
47363 return SDValue();
47364
47365 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47366 SDValue Cond = N->getOperand(0);
47367 SDValue TVal = N->getOperand(1);
47368 SDValue FVal = N->getOperand(2);
47369 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47370 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47371 return SDValue();
47372
47373 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47374 ArrayRef<SDValue> Ops) {
47375 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47376 };
47377 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47378 /*CheckBWI*/ false);
47379}
47380
// Fold a scalar integer select between two constant arms into branchless math:
//   select Cond, TC, FC  -->  (zext(Cond) * (TC - FC)) + FC
// when |TC - FC| is a power of 2 (becomes a shift) or a cheap LEA multiplier
// (3/5/9 on i32/i64). Returns SDValue() when the fold does not apply.
// NOTE(review): the opening line of this function's signature was dropped by
// the excerpt extraction (original line 47381); the visible trailing
// parameters are the select node N (implied), the DAG, and debug location DL.
47382                                          const SDLoc &DL) {
47383   SDValue Cond = N->getOperand(0);
47384   SDValue LHS = N->getOperand(1);
47385   SDValue RHS = N->getOperand(2);
47386
// Both select arms must be integer constants for this transform.
47387   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47388   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47389   if (!TrueC || !FalseC)
47390     return SDValue();
47391
47392   // Don't do this for crazy integer types.
47393   EVT VT = N->getValueType(0);
47394   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47395     return SDValue();
47396
47397   // We're going to use the condition bit in math or logic ops. We could allow
47398   // this with a wider condition value (post-legalization it becomes an i8),
47399   // but if nothing is creating selects that late, it doesn't matter.
47400   if (Cond.getValueType() != MVT::i1)
47401     return SDValue();
47402
47403   // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47404   // 3, 5, or 9 with i32/i64, so those get transformed too.
47405   // TODO: For constants that overflow or do not differ by power-of-2 or small
47406   // multiplier, convert to 'and' + 'add'.
47407   const APInt &TrueVal = TrueC->getAPIntValue();
47408   const APInt &FalseVal = FalseC->getAPIntValue();
47409
47410   // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
// Bail out here so the SBB-based lowering elsewhere can match instead of
// this zext/mul/add sequence.
47411   if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47412       Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47413     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47414     if (CC == ISD::SETEQ || CC == ISD::SETNE)
47415       return SDValue();
47416   }
47417
// Signed subtract with overflow detection: if TC - FC overflows, the
// multiplier cannot be represented in VT, so give up.
47418   bool OV;
47419   APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47420   if (OV)
47421     return SDValue();
47422
47423   APInt AbsDiff = Diff.abs();
47424   if (AbsDiff.isPowerOf2() ||
47425       ((VT == MVT::i32 || VT == MVT::i64) &&
47426        (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47427
47428     // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47429     // of the condition can usually be folded into a compare predicate, but even
47430     // without that, the sequence should be cheaper than a CMOV alternative.
47431     if (TrueVal.slt(FalseVal)) {
47432       Cond = DAG.getNOT(DL, Cond, MVT::i1);
47433       std::swap(TrueC, FalseC);
47434     }
47435
47436     // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
47437     SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47438
47439     // Multiply condition by the difference if non-one.
47440     if (!AbsDiff.isOne())
47441       R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47442
47443     // Add the base if non-zero.
47444     if (!FalseC->isZero())
47445       R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47446
47447     return R;
47448   }
47449
47450   return SDValue();
47451 }
47452
47453 /// If this is a *dynamic* select (non-constant condition) and we can match
47454 /// this node with one of the variable blend instructions, restructure the
47455 /// condition so that blends can use the high (sign) bit of each element.
47456 /// This function will also call SimplifyDemandedBits on already created
47457 /// BLENDV to perform additional simplifications.
// NOTE(review): the first signature line and the DAGCombinerInfo parameter
// line were dropped by the excerpt extraction (original lines 47458/47460).
47459                                            const SDLoc &DL,
47461                                            const X86Subtarget &Subtarget) {
47462   SDValue Cond = N->getOperand(0);
// Only VSELECT / X86ISD::BLENDV nodes are candidates.
// NOTE(review): an additional guard on this condition (original line 47465,
// presumably rejecting constant-vector conditions) is missing from this
// excerpt — confirm against upstream before relying on the exact predicate.
47463   if ((N->getOpcode() != ISD::VSELECT &&
47464        N->getOpcode() != X86ISD::BLENDV) ||
47466     return SDValue();
47467
47468   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47469   unsigned BitWidth = Cond.getScalarValueSizeInBits();
47470   EVT VT = N->getValueType(0);
47471
47472   // We can only handle the cases where VSELECT is directly legal on the
47473   // subtarget. We custom lower VSELECT nodes with constant conditions and
47474   // this makes it hard to see whether a dynamic VSELECT will correctly
47475   // lower, so we both check the operation's status and explicitly handle the
47476   // cases where a *dynamic* blend will fail even though a constant-condition
47477   // blend could be custom lowered.
47478   // FIXME: We should find a better way to handle this class of problems.
47479   // Potentially, we should combine constant-condition vselect nodes
47480   // pre-legalization into shuffles and not mark as many types as custom
47481   // lowered.
// NOTE(review): the legality check itself (original line 47482, presumably
// a TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) test) is missing here.
47483     return SDValue();
47484   // FIXME: We don't support i16-element blends currently. We could and
47485   // should support them by making *all* the bits in the condition be set
47486   // rather than just the high bit and using an i8-element blend.
47487   if (VT.getVectorElementType() == MVT::i16)
47488     return SDValue();
47489   // Dynamic blending was only available from SSE4.1 onward.
47490   if (VT.is128BitVector() && !Subtarget.hasSSE41())
47491     return SDValue();
47492   // Byte blends are only available in AVX2
47493   if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47494     return SDValue();
47495   // There are no 512-bit blend instructions that use sign bits.
47496   if (VT.is512BitVector())
47497     return SDValue();
47498
47499   // Don't optimize before the condition has been transformed to a legal type
47500   // and don't ever optimize vector selects that map to AVX512 mask-registers.
47501   if (BitWidth < 8 || BitWidth > 64)
47502     return SDValue();
47503
// True iff every use of Cond is as the condition operand (operand 0) of a
// VSELECT/BLENDV — only then may we rewrite Cond's bits in place, since a
// blend only consumes the sign bit of each element.
47504   auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47505     for (SDUse &Use : Cond->uses())
47506       if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47507            Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47508           Use.getOperandNo() != 0)
47509         return false;
47510
47511     return true;
47512   };
47513
// NOTE(review): the demanded-bits mask construction (original line 47514,
// presumably the per-element sign-bit mask) is missing from this excerpt.
47515
47516   if (OnlyUsedAsSelectCond(Cond)) {
47517     KnownBits Known;
// NOTE(review): the TargetLoweringOpt constructor line (original 47518) is
// missing; only its continuation survives below.
47519                                           !DCI.isBeforeLegalizeOps());
47520     if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47521       return SDValue();
47522
47523     // If we changed the computation somewhere in the DAG, this change will
47524     // affect all users of Cond. Update all the nodes so that we do not use
47525     // the generic VSELECT anymore. Otherwise, we may perform wrong
47526     // optimizations as we messed with the actual expectation for the vector
47527     // boolean values.
47528     for (SDNode *U : Cond->users()) {
// Already-converted users need no rewrite.
47529       if (U->getOpcode() == X86ISD::BLENDV)
47530         continue;
47531
47532       SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47533                                Cond, U->getOperand(1), U->getOperand(2));
47534       DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47535       DCI.AddToWorklist(U);
47536     }
47537     DCI.CommitTargetLoweringOpt(TLO);
// Returning N itself signals the combiner that N was updated in place.
47538     return SDValue(N, 0);
47539   }
47540
47541   // Otherwise we can still at least try to simplify multiple use bits.
// NOTE(review): the SimplifyMultipleUseDemandedBits call that produces V
// (original line 47542) is missing from this excerpt.
47543     return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47544                        N->getOperand(1), N->getOperand(2));
47545
47546   return SDValue();
47547 }
47548
47549 // Try to match:
47550 //   (or (and (M, (sub 0, X)), (pandn M, X)))
47551 // which is a special case of:
47552 //   (select M, (sub 0, X), X)
47553 // Per:
47554 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47555 // We know that, if fNegate is 0 or 1:
47556 //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47557 //
47558 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47559 //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47560 //   ( M      ? -X : X) == ((X ^ M     ) + (M & 1))
47561 // This lets us transform our vselect to:
47562 //   (add (xor X, M), (and M, 1))
47563 // And further to:
47564 //   (sub (xor X, M), M)
// NOTE(review): the line carrying the function's name (original line 47565)
// was dropped by the excerpt extraction; only the parameter list survives.
47566     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47567     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47568   using namespace SDPatternMatch;
47569   EVT MaskVT = Mask.getValueType();
// Caller contract: Mask must be a sign-splatted all-zeros/all-ones vector.
47570   assert(MaskVT.isInteger() &&
47571          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47572          "Mask must be zero/all-bits");
47573
// NOTE(review): an extra operand-legality condition (original line 47575)
// is missing between these two lines in this excerpt.
47574   if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47576     return SDValue();
47577
// Detect which arm is the negation: V binds to the non-negated value,
// whether the (sub 0, V) appears on the true or false side.
47578   SDValue V;
47579   if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47580       !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47581     return SDValue();
47582
47583   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47584   SDValue SubOp2 = Mask;
47585
47586   // If the negate was on the false side of the select, then
47587   // the operands of the SUB need to be swapped. PR 27251.
47588   // This is because the pattern being matched above is
47589   // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
47590   // but if the pattern matched was
47591   // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47592   // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47593   // pattern also needs to be a negation of the replacement pattern above.
47594   // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47595   // sub accomplishes the negation of the replacement pattern.
47596   if (V == Y)
47597     std::swap(SubOp1, SubOp2);
47598
47599   SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
// Result is computed in the mask type; bitcast back to the select's type.
47600   return DAG.getBitcast(VT, Res);
47601 }
47602
// On AVX512, a masked instruction can fold the true-side (LHS) operand of a
// vselect but not the false side. If only RHS would fold, invert the setcc
// condition and swap the arms so RHS becomes the foldable LHS:
//   (vselect M, L, R) -> (vselect ~M, R, L)
// NOTE(review): the opening line of the signature (original line 47603) was
// dropped by the excerpt extraction; the visible parameters end with the
// subtarget below.
47604                               const X86Subtarget &Subtarget) {
47605   using namespace SDPatternMatch;
47606   if (!Subtarget.hasAVX512())
47607     return SDValue();
47608
// Match a vselect whose condition is a single-use setcc; capture its
// operands and predicate so the condition can be rebuilt inverted.
47609   ISD::CondCode CC;
47610   SDValue Cond, X, Y, LHS, RHS;
47611   if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47612                                      m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47613                                                       m_CondCode(CC)))),
47614                              m_Value(LHS), m_Value(RHS))))
47615     return SDValue();
47616
// Commuting only helps when LHS can NOT already be folded as a mask
// operation but RHS can — otherwise leave the select alone.
47617   if (canCombineAsMaskOperation(LHS, Subtarget) ||
47618       !canCombineAsMaskOperation(RHS, Subtarget))
47619     return SDValue();
47620
47621   // Commute LHS and RHS to create opportunity to select mask instruction.
47622   // (vselect M, L, R) -> (vselect ~M, R, L)
47623   ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47624   Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47625   return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47626 }
47627
47628 /// Do target-specific dag combines on SELECT and VSELECT nodes.
// NOTE(review): this excerpt lost a number of original source lines (visible
// as jumps in the embedded line numbers, e.g. 47629-30, 47660-61, 47679-80,
// 47743-45, 47871, 47981, 48044, 48065, 48135, 48145, 48150). The surviving
// lines are reproduced unchanged; hedged notes mark the larger gaps.
47631                             const X86Subtarget &Subtarget) {
47632   SDLoc DL(N);
47633   SDValue Cond = N->getOperand(0);
47634   SDValue LHS = N->getOperand(1);
47635   SDValue RHS = N->getOperand(2);
47636
47637   // Try simplification again because we use this function to optimize
47638   // BLENDV nodes that are not handled by the generic combiner.
47639   if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47640     return V;
47641
47642   // When avx512 is available the lhs operand of select instruction can be
47643   // folded with mask instruction, while the rhs operand can't. Commute the
47644   // lhs and rhs of the select instruction to create the opportunity of
47645   // folding.
47646   if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47647     return V;
47648
47649   EVT VT = LHS.getValueType();
47650   EVT CondVT = Cond.getValueType();
47651   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47652   bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47653
47654   // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47655   // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47656   // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
// NOTE(review): the call producing V (original lines 47660-61, presumably
// invoking the conditional-negate combine above) is missing; only its
// trailing argument line survives.
47657   if (CondVT.isVector() && CondVT.isInteger() &&
47658       CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47659       (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47662                                                DL, DAG, Subtarget))
47663     return V;
47664
47665   if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47666     SmallVector<int, 64> CondMask;
47667     if (createShuffleMaskFromVSELECT(CondMask, Cond,
47668                                      N->getOpcode() == X86ISD::BLENDV)) {
47669       // Convert vselects with constant condition into shuffles.
47670       if (DCI.isBeforeLegalizeOps())
47671         return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47672
47673       // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47674       // by forcing the unselected elements to zero.
47675       // TODO: Can we handle more shuffles with this?
47676       if (LHS.hasOneUse() && RHS.hasOneUse()) {
47677         SmallVector<SDValue, 1> LHSOps, RHSOps;
47678         SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
// NOTE(review): the definitions of LHSShuf/RHSShuf (original lines 47679-80,
// presumably peeking through one-use bitcasts of LHS/RHS) are missing here.
47681         if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47682             RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47683             scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47684             getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47685             getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47686           assert(ByteMask.size() == LHSMask.size() &&
47687                  ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47688           for (auto [I, M] : enumerate(ByteMask)) {
47689             // getConstVector sets negative shuffle mask values as undef, so
47690             // ensure we hardcode SM_SentinelZero values to zero (0x80).
47691             if (M < (int)ByteMask.size()) {
47692               LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47693               RHSMask[I] = 0x80;
47694             } else {
47695               LHSMask[I] = 0x80;
47696               RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47697             }
47698           }
47699           MVT ByteVT = LHSShuf.getSimpleValueType();
47700           LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47701                             getConstVector(LHSMask, ByteVT, DAG, DL, true));
47702           RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47703                             getConstVector(RHSMask, ByteVT, DAG, DL, true));
47704           return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47705         }
47706       }
47707
47708       // Attempt to combine as shuffle.
47709       SDValue Op(N, 0);
47710       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47711         return Res;
47712     }
47713   }
47714
47715   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47716   // instructions match the semantics of the common C idiom x<y?x:y but not
47717   // x<=y?x:y, because of how they handle negative zero (which can be
47718   // ignored in unsafe-math mode).
47719   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
// NOTE(review): inside the two switch statements below, several of the
// guard-condition lines (originals 47743, 47745, 47754-55, 47773-74, 47785,
// 47810, 47846) are missing from this excerpt; only the surviving halves of
// those conditions appear.
47720   if ((Cond.getOpcode() == ISD::SETCC ||
47721        Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47722       VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47723       !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47724       ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47725       (Subtarget.hasSSE2() ||
47726        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47727     bool IsStrict = Cond->isStrictFPOpcode();
47728     ISD::CondCode CC =
47729         cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47730     SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47731     SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47732
47733     unsigned Opcode = 0;
47734     // Check for x CC y ? x : y.
47735     if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47736       switch (CC) {
47737       default: break;
47738       case ISD::SETULT:
47739         // Converting this to a min would handle NaNs incorrectly, and swapping
47740         // the operands would cause it to handle comparisons between positive
47741         // and negative zero incorrectly.
47742         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47744               !(DAG.isKnownNeverZeroFloat(LHS) ||
47746             break;
47747           std::swap(LHS, RHS);
47748         }
47749         Opcode = X86ISD::FMIN;
47750         break;
47751       case ISD::SETOLE:
47752         // Converting this to a min would handle comparisons between positive
47753         // and negative zero incorrectly.
47756           break;
47757         Opcode = X86ISD::FMIN;
47758         break;
47759       case ISD::SETULE:
47760         // Converting this to a min would handle both negative zeros and NaNs
47761         // incorrectly, but we can swap the operands to fix both.
47762         std::swap(LHS, RHS);
47763         [[fallthrough]];
47764       case ISD::SETOLT:
47765       case ISD::SETLT:
47766       case ISD::SETLE:
47767         Opcode = X86ISD::FMIN;
47768         break;
47769
47770       case ISD::SETOGE:
47771         // Converting this to a max would handle comparisons between positive
47772         // and negative zero incorrectly.
47775           break;
47776         Opcode = X86ISD::FMAX;
47777         break;
47778       case ISD::SETUGT:
47779         // Converting this to a max would handle NaNs incorrectly, and swapping
47780         // the operands would cause it to handle comparisons between positive
47781         // and negative zero incorrectly.
47782         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47784               !(DAG.isKnownNeverZeroFloat(LHS) ||
47786             break;
47787           std::swap(LHS, RHS);
47788         }
47789         Opcode = X86ISD::FMAX;
47790         break;
47791       case ISD::SETUGE:
47792         // Converting this to a max would handle both negative zeros and NaNs
47793         // incorrectly, but we can swap the operands to fix both.
47794         std::swap(LHS, RHS);
47795         [[fallthrough]];
47796       case ISD::SETOGT:
47797       case ISD::SETGT:
47798       case ISD::SETGE:
47799         Opcode = X86ISD::FMAX;
47800         break;
47801       }
47802     // Check for x CC y ? y : x -- a min/max with reversed arms.
47803     } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47804       switch (CC) {
47805       default: break;
47806       case ISD::SETOGE:
47807         // Converting this to a min would handle comparisons between positive
47808         // and negative zero incorrectly, and swapping the operands would
47809         // cause it to handle NaNs incorrectly.
47811             !(DAG.isKnownNeverZeroFloat(LHS) ||
47812               DAG.isKnownNeverZeroFloat(RHS))) {
47813           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47814             break;
47815           std::swap(LHS, RHS);
47816         }
47817         Opcode = X86ISD::FMIN;
47818         break;
47819       case ISD::SETUGT:
47820         // Converting this to a min would handle NaNs incorrectly.
47821         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47822           break;
47823         Opcode = X86ISD::FMIN;
47824         break;
47825       case ISD::SETUGE:
47826         // Converting this to a min would handle both negative zeros and NaNs
47827         // incorrectly, but we can swap the operands to fix both.
47828         std::swap(LHS, RHS);
47829         [[fallthrough]];
47830       case ISD::SETOGT:
47831       case ISD::SETGT:
47832       case ISD::SETGE:
47833         Opcode = X86ISD::FMIN;
47834         break;
47835
47836       case ISD::SETULT:
47837         // Converting this to a max would handle NaNs incorrectly.
47838         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47839           break;
47840         Opcode = X86ISD::FMAX;
47841         break;
47842       case ISD::SETOLE:
47843         // Converting this to a max would handle comparisons between positive
47844         // and negative zero incorrectly, and swapping the operands would
47845         // cause it to handle NaNs incorrectly.
47847             !DAG.isKnownNeverZeroFloat(LHS) &&
47848             !DAG.isKnownNeverZeroFloat(RHS)) {
47849           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47850             break;
47851           std::swap(LHS, RHS);
47852         }
47853         Opcode = X86ISD::FMAX;
47854         break;
47855       case ISD::SETULE:
47856         // Converting this to a max would handle both negative zeros and NaNs
47857         // incorrectly, but we can swap the operands to fix both.
47858         std::swap(LHS, RHS);
47859         [[fallthrough]];
47860       case ISD::SETOLT:
47861       case ISD::SETLT:
47862       case ISD::SETLE:
47863         Opcode = X86ISD::FMAX;
47864         break;
47865       }
47866     }
47867
47868     if (Opcode) {
// Strict FP selects must preserve the chain; replace the setcc's chain
// result with the new node's chain.
// NOTE(review): the second opcode of the ternary (original line 47871,
// presumably X86ISD::STRICT_FMAX) is missing from this excerpt.
47869       if (IsStrict) {
47870         SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47872                                   DL, {N->getValueType(0), MVT::Other},
47873                                   {Cond.getOperand(0), LHS, RHS});
47874         DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47875         return Ret;
47876       }
47877       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47878     }
47879   }
47880
47881   // Some mask scalar intrinsics rely on checking if only one bit is set
47882   // and implement it in C code like this:
47883   //  A[0] = (U & 1) ? A[0] : W[0];
47884   // This creates some redundant instructions that break pattern matching.
47885   // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47886   if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47887       Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47888     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47889     SDValue AndNode = Cond.getOperand(0);
47890     if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47891         isNullConstant(Cond.getOperand(1)) &&
47892         isOneConstant(AndNode.getOperand(1))) {
47893       // LHS and RHS swapped due to
47894       // setcc outputting 1 when AND resulted in 0 and vice versa.
47895       AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47896       return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47897     }
47898   }
47899
47900   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47901   // lowering on KNL. In this case we convert it to
47902   // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47903   // The same situation all vectors of i8 and i16 without BWI.
47904   // Make sure we extend these even before type legalization gets a chance to
47905   // split wide vectors.
47906   // Since SKX these selects have a proper lowering.
47907   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47908       CondVT.getVectorElementType() == MVT::i1 &&
47909       (VT.getVectorElementType() == MVT::i8 ||
47910        VT.getVectorElementType() == MVT::i16)) {
47911     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47912     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47913   }
47914
47915   // AVX512 - Extend select to merge with target shuffle.
47916   // select(mask, extract_subvector(shuffle(x)), y) -->
47917   // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47918   // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47919   if (Subtarget.hasAVX512() && CondVT.isVector() &&
47920       CondVT.getVectorElementType() == MVT::i1) {
47921     auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47922       return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47923              isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47924              isNullConstant(Op.getOperand(1)) &&
47925              TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47926              Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47927              (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47928               ISD::isBuildVectorAllZeros(Alt.getNode()));
47929     };
47930
47931     bool SelectableLHS = SelectableOp(LHS, RHS);
47932     bool SelectableRHS = SelectableOp(RHS, LHS);
47933     if (SelectableLHS || SelectableRHS) {
47934       EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47935                                 : RHS.getOperand(0).getValueType();
47936       EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47937       LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47938                             VT.getSizeInBits());
47939       RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47940                             VT.getSizeInBits());
47941       Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47942                          DAG.getUNDEF(SrcCondVT), Cond,
47943                          DAG.getVectorIdxConstant(0, DL));
47944       SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47945       return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47946     }
47947   }
47948
47949   if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
47950     return V;
47951
47952   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47953       Cond.hasOneUse()) {
47954     EVT CondVT = Cond.getValueType();
47955     SDValue Cond0 = Cond.getOperand(0);
47956     SDValue Cond1 = Cond.getOperand(1);
47957     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47958
47959     // Canonicalize min/max:
47960     // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47961     // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47962     // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47963     // the need for an extra compare against zero. e.g.
47964     // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
47965     // subl   %esi, %edi
47966     // testl  %edi, %edi
47967     // movl   $0, %eax
47968     // cmovgl %edi, %eax
47969     // =>
47970     // xorl   %eax, %eax
47971     // subl   %esi, $edi
47972     // cmovsl %eax, %edi
47973     //
47974     // We can also canonicalize
47975     //  (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
47976     //  (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
47977     // This allows the use of a test instruction for the compare.
// NOTE(review): the NewCC computation (original line 47981) is missing
// between these lines in this excerpt.
47978     if (LHS == Cond0 && RHS == Cond1) {
47979       if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47980           (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
47982         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47983         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47984       }
47985       if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47986         ISD::CondCode NewCC = ISD::SETUGE;
47987         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47988         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47989       }
47990     }
47991
47992     // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47993     // fold eq + gt/lt nested selects into ge/le selects
47994     // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47995     // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47996     // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47997     // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47998     // .. etc ..
47999     if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48000         RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48001       SDValue InnerSetCC = RHS.getOperand(0);
48002       ISD::CondCode InnerCC =
48003           cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48004       if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48005           Cond0 == InnerSetCC.getOperand(0) &&
48006           Cond1 == InnerSetCC.getOperand(1)) {
48007         ISD::CondCode NewCC;
48008         switch (CC == ISD::SETEQ ? InnerCC : CC) {
48009         // clang-format off
48010         case ISD::SETGT:  NewCC = ISD::SETGE; break;
48011         case ISD::SETLT:  NewCC = ISD::SETLE; break;
48012         case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48013         case ISD::SETULT: NewCC = ISD::SETULE; break;
48014         default: NewCC = ISD::SETCC_INVALID; break;
48015         // clang-format on
48016         }
48017         if (NewCC != ISD::SETCC_INVALID) {
48018           Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48019           return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48020         }
48021       }
48022     }
48023   }
48024
48025   // Check if the first operand is all zeros and Cond type is vXi1.
48026   // If this an avx512 target we can improve the use of zero masking by
48027   // swapping the operands and inverting the condition.
48028   if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48029       Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48030       ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48031       !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48032     // Invert the cond to not(cond) : xor(op,allones)=not(op)
48033     SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48034     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48035     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48036   }
48037
48038   // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48039   // get split by legalization.
// NOTE(review): the call producing ExtCond (original line 48044, presumably
// combineToExtendBoolVectorInReg) is missing; only its argument line remains.
48040   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48041       CondVT.getVectorElementType() == MVT::i1 &&
48042       TLI.isTypeLegal(VT.getScalarType())) {
48043     EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48045             ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48046       ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48047       return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48048     }
48049   }
48050
48051   // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48052   // with out-of-bounds clamping.
48053
48054   // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
48055   // shift amounts exceeding the element bitwidth. VSHLV/VSRLV clamps the amount
48056   // to bitwidth-1 for unsigned shifts, effectively performing a maximum left
48057   // shift of bitwidth-1 positions. and returns zero for unsigned right shifts
48058   // exceeding bitwidth-1.
// NOTE(review): inside both sd_match calls below, one matcher argument line
// each (originals 48065/48067 and 48077/48079, presumably the splat of the
// element bitwidth) is missing from this excerpt.
48059   if (N->getOpcode() == ISD::VSELECT) {
48060     using namespace llvm::SDPatternMatch;
48061     // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48062     // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48063     if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48064         supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48066         sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48068                                m_SpecificCondCode(ISD::SETULT)))) {
48069       return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48070                                                      : X86ISD::VSHLV,
48071                          DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48072     }
48073     // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48074     // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48075     if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48076         supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48078         sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48080                                m_SpecificCondCode(ISD::SETUGE)))) {
48081       return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48082                                                      : X86ISD::VSHLV,
48083                          DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48084     }
48085   }
48086
48087   // Early exit check
48088   if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48089     return SDValue();
48090
48091   if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48092     return V;
48093
48094   if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48095     return V;
48096
48097   // select(~Cond, X, Y) -> select(Cond, Y, X)
48098   if (CondVT.getScalarType() != MVT::i1) {
48099     if (SDValue CondNot = IsNOT(Cond, DAG))
48100       return DAG.getNode(N->getOpcode(), DL, VT,
48101                          DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48102
48103     // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48104     if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48105         Cond.getOperand(0).getOpcode() == ISD::AND &&
48106         ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48107         isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48108                            Cond.getScalarValueSizeInBits(),
48109                            /*AllowUndefs=*/true) &&
48110         Cond.hasOneUse()) {
48111       Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48112                          Cond.getOperand(0).getOperand(1));
48113       return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48114     }
48115
48116     // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48117     // signbit.
48118     if (Cond.getOpcode() == X86ISD::PCMPGT &&
48119         ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48120         Cond.hasOneUse()) {
48121       Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48122                          DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48123       return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48124     }
48125   }
48126
48127   // Try to optimize vXi1 selects if both operands are either all constants or
48128   // bitcasts from scalar integer type. In that case we can convert the operands
48129   // to integer and use an integer select which will be converted to a CMOV.
48130   // We need to take a little bit of care to avoid creating an i64 type after
48131   // type legalization.
// NOTE(review): the IntVT definition (original line 48135) and the two
// constant-to-integer conversion lines (originals 48145/48150) are missing
// from this excerpt.
48132   if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48133       VT.getVectorElementType() == MVT::i1 &&
48134       (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48136     if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48137       bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48138       bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48139
48140       if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48141                           LHS.getOperand(0).getValueType() == IntVT)) &&
48142           (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48143                           RHS.getOperand(0).getValueType() == IntVT))) {
48144         if (LHSIsConst)
48146         else
48147           LHS = LHS.getOperand(0);
48148
48149         if (RHSIsConst)
48151         else
48152           RHS = RHS.getOperand(0);
48153
48154         SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48155         return DAG.getBitcast(VT, Select);
48156       }
48157     }
48158   }
48159
48160   // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48161   // single bits, then invert the predicate and swap the select operands.
48162   // This can lower using a vector shift bit-hack rather than mask and compare.
48163   if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48164       N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48165       Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48166       Cond.getOperand(0).getOpcode() == ISD::AND &&
48167       isNullOrNullSplat(Cond.getOperand(1)) &&
48168       cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48169       Cond.getOperand(0).getValueType() == VT) {
48170     // The 'and' mask must be composed of power-of-2 constants.
48171     SDValue And = Cond.getOperand(0);
48172     auto *C = isConstOrConstSplat(And.getOperand(1));
48173     if (C && C->getAPIntValue().isPowerOf2()) {
48174       // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48175       SDValue NotCond =
48176           DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48177       return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48178     }
48179
48180     // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48181     // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48182     // 16-bit lacks a proper blendv.
48183     unsigned EltBitWidth = VT.getScalarSizeInBits();
48184     bool CanShiftBlend =
48185         TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48186                                 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48187                                 (Subtarget.hasXOP()));
48188     if (CanShiftBlend &&
48189         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48190           return C->getAPIntValue().isPowerOf2();
48191         })) {
48192       // Create a left-shift constant to get the mask bits over to the sign-bit.
48193       SDValue Mask = And.getOperand(1);
48194       SmallVector<int, 32> ShlVals;
48195       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48196         auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48197         ShlVals.push_back(EltBitWidth - 1 -
48198                           MaskVal->getAPIntValue().exactLogBase2());
48199       }
48200       // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48201       SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48202       SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48203       SDValue NewCond =
48204           DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48205       return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48206     }
48207   }
48208
48209   return SDValue();
48210 }
48211
48212/// Combine:
48213/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48214/// to:
48215/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48216/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48217/// Note that this is only legal for some op/cc combinations.
// NOTE(review): the extraction dropped source line 48218, which held the
// declaration — presumably
//   static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
// — confirm against the upstream file.
48219 SelectionDAG &DAG,
48220 const X86Subtarget &Subtarget) {
48221 // This combine only operates on CMP-like nodes.
48222 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48223 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48224 return SDValue();
48225
48226 // Can't replace the cmp if it has more uses than the one we're looking at.
48227 // FIXME: We would like to be able to handle this, but would need to make sure
48228 // all uses were updated.
48229 if (!Cmp.hasOneUse())
48230 return SDValue();
48231
48232 // This only applies to variations of the common case:
48233 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48234 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48235 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48236 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48237 // Using the proper condcodes (see below), overflow is checked for.
48238
48239 // FIXME: We can generalize both constraints:
48240 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48241 // - LHS != 1
48242 // if the result is compared.
48243
48244 SDValue CmpLHS = Cmp.getOperand(0);
48245 SDValue CmpRHS = Cmp.getOperand(1);
48246 EVT CmpVT = CmpLHS.getValueType();
48247
48248 if (!CmpLHS.hasOneUse())
48249 return SDValue();
48250
48251 unsigned Opc = CmpLHS.getOpcode();
  // NOTE(review): source line 48252 is missing here — presumably the guard
  //   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
  // confirm against upstream.
48253 return SDValue();
48254
  // Operand 2 of an atomic RMW node is the value operand (0 = chain, 1 = ptr).
48255 SDValue OpRHS = CmpLHS.getOperand(2);
48256 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48257 if (!OpRHSC)
48258 return SDValue();
48259
48260 APInt Addend = OpRHSC->getAPIntValue();
  // NOTE(review): source line 48261 is missing — presumably
  //   if (Opc == ISD::ATOMIC_LOAD_SUB)
  // so that a locked sub is normalized to an addition of the negated amount.
48262 Addend = -Addend;
48263
48264 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48265 if (!CmpRHSC)
48266 return SDValue();
48267
48268 APInt Comparison = CmpRHSC->getAPIntValue();
48269 APInt NegAddend = -Addend;
48270
48271 // See if we can adjust the CC to make the comparison match the negated
48272 // addend.
48273 if (Comparison != NegAddend) {
48274 APInt IncComparison = Comparison + 1;
48275 if (IncComparison == NegAddend) {
48276 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48277 Comparison = IncComparison;
48278 CC = X86::COND_AE;
48279 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48280 Comparison = IncComparison;
48281 CC = X86::COND_L;
48282 }
48283 }
48284 APInt DecComparison = Comparison - 1;
48285 if (DecComparison == NegAddend) {
48286 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48287 Comparison = DecComparison;
48288 CC = X86::COND_A;
48289 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48290 Comparison = DecComparison;
48291 CC = X86::COND_LE;
48292 }
48293 }
48294 }
48295
48296 // If the addend is the negation of the comparison value, then we can do
48297 // a full comparison by emitting the atomic arithmetic as a locked sub.
48298 if (Comparison == NegAddend) {
48299 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48300 // atomic sub.
48301 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48302 auto AtomicSub = DAG.getAtomic(
48303 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48304 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48305 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48306 AN->getMemOperand());
48307 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
  // The old atomic op's data result is dead (checked above via hasOneUse on
  // the CMP); replace it with UNDEF and rewire the chain to the LOCKed op.
48308 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48309 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48310 return LockOp;
48311 }
48312
48313 // We can handle comparisons with zero in a number of cases by manipulating
48314 // the CC used.
48315 if (!Comparison.isZero())
48316 return SDValue();
48317
48318 if (CC == X86::COND_S && Addend == 1)
48319 CC = X86::COND_LE;
48320 else if (CC == X86::COND_NS && Addend == 1)
48321 CC = X86::COND_G;
48322 else if (CC == X86::COND_G && Addend == -1)
48323 CC = X86::COND_GE;
48324 else if (CC == X86::COND_LE && Addend == -1)
48325 CC = X86::COND_L;
48326 else
48327 return SDValue();
48328
48329 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48330 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48331 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48332 return LockOp;
48333}
48334
48335// Check whether we're just testing the signbit, and whether we can simplify
48336// this by tracking where the signbit came from.
// NOTE(review): source line 48337 (the declaration, presumably
//   static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
// ) was dropped by the extraction — confirm against upstream.
48338 SelectionDAG &DAG) {
  // Only S/NS conditions are pure sign-bit tests.
48339 if (CC != X86::COND_S && CC != X86::COND_NS)
48340 return SDValue();
48341
48342 if (!Cmp.hasOneUse())
48343 return SDValue();
48344
48345 SDValue Src;
48346 if (Cmp.getOpcode() == X86ISD::CMP) {
48347 // CMP(X,0) -> signbit test
48348 if (!isNullConstant(Cmp.getOperand(1)))
48349 return SDValue();
48350 Src = Cmp.getOperand(0);
48351 // Peek through a SRA node as we just need the signbit.
48352 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48353 // TODO: Use SimplifyDemandedBits instead of just SRA?
48354 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48355 return SDValue();
48356 Src = Src.getOperand(0);
48357 } else if (Cmp.getOpcode() == X86ISD::OR) {
48358 // OR(X,Y) -> see if only one operand contributes to the signbit.
48359 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48360 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48361 Src = Cmp.getOperand(1);
48362 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48363 Src = Cmp.getOperand(0);
48364 else
48365 return SDValue();
48366 } else {
48367 return SDValue();
48368 }
48369
48370 // Replace with a TEST on the MSB.
48371 SDLoc DL(Cmp);
48372 MVT SrcVT = Src.getSimpleValueType();
48373 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48374
48375 // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
48376 // peek through and adjust the TEST bit.
48377 if (Src.getOpcode() == ISD::SHL) {
48378 if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48379 Src = Src.getOperand(0);
48380 BitMask.lshrInPlace(*ShiftAmt);
48381 }
48382 }
48383
  // AND with the single tested bit, then compare against zero; a sign test
  // (S/NS) on the old value becomes a zero test (NE/E) on the masked value.
48384 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48385 DAG.getConstant(BitMask, DL, SrcVT));
48386 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48387 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48388 DAG.getConstant(0, DL, SrcVT));
48389}
48390
48391// Check whether a boolean test is testing a boolean value generated by
48392// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48393// code.
48394//
48395// Simplify the following patterns:
48396// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48397// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48398// to (Op EFLAGS Cond)
48399//
48400// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48401// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48402// to (Op EFLAGS !Cond)
48403//
48404// where Op could be BRCOND or CMOV.
48405//
// NOTE(review): source line 48406 (the declaration, presumably
//   static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// ) was dropped by the extraction — confirm against upstream.
48407 // This combine only operates on CMP-like nodes.
48408 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48409 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48410 return SDValue();
48411
48412 // Quit if not used as a boolean value.
48413 if (CC != X86::COND_E && CC != X86::COND_NE)
48414 return SDValue();
48415
48416 // Check CMP operands. One of them should be 0 or 1 and the other should be
48417 // an SetCC or extended from it.
48418 SDValue Op1 = Cmp.getOperand(0);
48419 SDValue Op2 = Cmp.getOperand(1);
48420
48421 SDValue SetCC;
48422 const ConstantSDNode* C = nullptr;
48423 bool needOppositeCond = (CC == X86::COND_E);
48424 bool checkAgainstTrue = false; // Is it a comparison against 1?
48425
48426 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48427 SetCC = Op2;
48428 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48429 SetCC = Op1;
48430 else // Quit if all operands are not constants.
48431 return SDValue();
48432
48433 if (C->getZExtValue() == 1) {
48434 needOppositeCond = !needOppositeCond;
48435 checkAgainstTrue = true;
48436 } else if (C->getZExtValue() != 0)
48437 // Quit if the constant is neither 0 or 1.
48438 return SDValue();
48439
48440 bool truncatedToBoolWithAnd = false;
48441 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48442 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48443 SetCC.getOpcode() == ISD::TRUNCATE ||
48444 SetCC.getOpcode() == ISD::AND) {
48445 if (SetCC.getOpcode() == ISD::AND) {
48446 int OpIdx = -1;
48447 if (isOneConstant(SetCC.getOperand(0)))
48448 OpIdx = 1;
48449 if (isOneConstant(SetCC.getOperand(1)))
48450 OpIdx = 0;
48451 if (OpIdx < 0)
48452 break;
48453 SetCC = SetCC.getOperand(OpIdx);
48454 truncatedToBoolWithAnd = true;
48455 } else
48456 SetCC = SetCC.getOperand(0);
48457 }
48458
48459 switch (SetCC.getOpcode()) {
  // NOTE(review): source line 48460 is missing — presumably
  //   case X86ISD::SETCC_CARRY:
  // (the comment below describes SETCC_CARRY semantics).
48461 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48462 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48463 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48464 // truncated to i1 using 'and'.
48465 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48466 break;
    // NOTE(review): source line 48467 is missing — presumably the first half
    // of an assert that the carry condition is COND_B, e.g.
    //   assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
48468 "Invalid use of SETCC_CARRY!");
48469 [[fallthrough]];
48470 case X86ISD::SETCC:
48471 // Set the condition code or opposite one if necessary.
48472 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48473 if (needOppositeCond)
      // NOTE(review): source line 48474 is missing — presumably
      //   CC = X86::GetOppositeBranchCondition(CC);
48475 return SetCC.getOperand(1);
48476 case X86ISD::CMOV: {
48477 // Check whether false/true value has canonical one, i.e. 0 or 1.
48478 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48479 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48480 // Quit if true value is not a constant.
48481 if (!TVal)
48482 return SDValue();
48483 // Quit if false value is not a constant.
48484 if (!FVal) {
48485 SDValue Op = SetCC.getOperand(0);
48486 // Skip 'zext' or 'trunc' node.
48487 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48488 Op.getOpcode() == ISD::TRUNCATE)
48489 Op = Op.getOperand(0);
48490 // A special case for rdrand/rdseed, where 0 is set if false cond is
48491 // found.
48492 if ((Op.getOpcode() != X86ISD::RDRAND &&
48493 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48494 return SDValue();
48495 }
48496 // Quit if false value is not the constant 0 or 1.
48497 bool FValIsFalse = true;
48498 if (FVal && FVal->getZExtValue() != 0) {
48499 if (FVal->getZExtValue() != 1)
48500 return SDValue();
48501 // If FVal is 1, opposite cond is needed.
48502 needOppositeCond = !needOppositeCond;
48503 FValIsFalse = false;
48504 }
48505 // Quit if TVal is not the constant opposite of FVal.
48506 if (FValIsFalse && TVal->getZExtValue() != 1)
48507 return SDValue();
48508 if (!FValIsFalse && TVal->getZExtValue() != 0)
48509 return SDValue();
48510 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48511 if (needOppositeCond)
      // NOTE(review): source line 48512 is missing — presumably
      //   CC = X86::GetOppositeBranchCondition(CC);
48513 return SetCC.getOperand(3);
48514 }
48515 }
48516
48517 return SDValue();
48518}
48519
48520/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48521/// Match:
48522/// (X86or (X86setcc) (X86setcc))
48523/// (X86cmp (and (X86setcc) (X86setcc)), 0)
// NOTE(review): source line 48524 (the declaration, presumably
//   static bool checkBoolTestAndOrSetCCCombine(SDValue Cond,
//                                              X86::CondCode &CC0,
// ) was dropped by the extraction — confirm against upstream.
// On success: CC0/CC1 receive the two SETCC condition codes, Flags the shared
// EFLAGS value, and isAnd reports whether the combiner matched AND (vs. OR).
48525 X86::CondCode &CC1, SDValue &Flags,
48526 bool &isAnd) {
  // Peel off an outer CMP-against-zero wrapper, if present.
48527 if (Cond->getOpcode() == X86ISD::CMP) {
48528 if (!isNullConstant(Cond->getOperand(1)))
48529 return false;
48530
48531 Cond = Cond->getOperand(0);
48532 }
48533
48534 isAnd = false;
48535
48536 SDValue SetCC0, SetCC1;
48537 switch (Cond->getOpcode()) {
48538 default: return false;
48539 case ISD::AND:
48540 case X86ISD::AND:
48541 isAnd = true;
48542 [[fallthrough]];
48543 case ISD::OR:
48544 case X86ISD::OR:
48545 SetCC0 = Cond->getOperand(0);
48546 SetCC1 = Cond->getOperand(1);
48547 break;
48548 };
48549
48550 // Make sure we have SETCC nodes, using the same flags value.
48551 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48552 SetCC1.getOpcode() != X86ISD::SETCC ||
48553 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48554 return false;
48555
48556 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48557 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48558 Flags = SetCC0->getOperand(1);
48559 return true;
48560}
48561
48562// When legalizing carry, we create carries via add X, -1
48563// If that comes from an actual carry, via setcc, we use the
48564// carry directly.
// NOTE(review): source line 48565 (the declaration, presumably
//   static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
// ) was dropped by the extraction — confirm against upstream.
48566 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48567 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48568 bool FoundAndLSB = false;
48569 SDValue Carry = EFLAGS.getOperand(0);
      // Walk through value-preserving wrappers around the carry bit:
      // truncate, zero-extend, and (x & 1).
48570 while (Carry.getOpcode() == ISD::TRUNCATE ||
48571 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48572 (Carry.getOpcode() == ISD::AND &&
48573 isOneConstant(Carry.getOperand(1)))) {
48574 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48575 Carry = Carry.getOperand(0);
48576 }
48577 if (Carry.getOpcode() == X86ISD::SETCC ||
48578 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48579 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48580 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48581 SDValue CarryOp1 = Carry.getOperand(1);
48582 if (CarryCC == X86::COND_B)
48583 return CarryOp1;
48584 if (CarryCC == X86::COND_A) {
48585 // Try to convert COND_A into COND_B in an attempt to facilitate
48586 // materializing "setb reg".
48587 //
48588 // Do not flip "e > c", where "c" is a constant, because Cmp
48589 // instruction cannot take an immediate as its first operand.
48590 //
48591 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48592 CarryOp1.getNode()->hasOneUse() &&
48593 CarryOp1.getValueType().isInteger() &&
48594 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48595 SDValue SubCommute =
48596 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48597 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48598 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48599 }
48600 }
48601 // If this is a check of the z flag of an add with 1, switch to the
48602 // C flag.
48603 if (CarryCC == X86::COND_E &&
48604 CarryOp1.getOpcode() == X86ISD::ADD &&
48605 isOneConstant(CarryOp1.getOperand(1)))
48606 return CarryOp1;
48607 } else if (FoundAndLSB) {
        // The carry came from testing a single bit; emit a BT on that bit
        // (bit 0 unless we peeked through an SRL selecting another bit).
48608 SDLoc DL(Carry);
48609 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48610 if (Carry.getOpcode() == ISD::SRL) {
48611 BitNo = Carry.getOperand(1);
48612 Carry = Carry.getOperand(0);
48613 }
48614 return getBT(Carry, BitNo, DL, DAG);
48615 }
48616 }
48617 }
48618
48619 return SDValue();
48620}
48621
48622/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48623/// to avoid the inversion.
// NOTE(review): source line 48624 (the declaration, presumably
//   static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
// ) was dropped by the extraction — confirm against upstream.
48625 SelectionDAG &DAG,
48626 const X86Subtarget &Subtarget) {
48627 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48628 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48629 EFLAGS.getOpcode() != X86ISD::TESTP)
48630 return SDValue();
48631
48632 // PTEST/TESTP sets EFLAGS as:
48633 // TESTZ: ZF = (Op0 & Op1) == 0
48634 // TESTC: CF = (~Op0 & Op1) == 0
48635 // TESTNZC: ZF == 0 && CF == 0
48636 MVT VT = EFLAGS.getSimpleValueType();
48637 SDValue Op0 = EFLAGS.getOperand(0);
48638 SDValue Op1 = EFLAGS.getOperand(1);
48639 MVT OpVT = Op0.getSimpleValueType();
48640 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48641
48642 // TEST*(~X,Y) == TEST*(X,Y)
48643 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48644 X86::CondCode InvCC;
48645 switch (CC) {
48646 case X86::COND_B:
48647 // testc -> testz.
48648 InvCC = X86::COND_E;
48649 break;
48650 case X86::COND_AE:
48651 // !testc -> !testz.
48652 InvCC = X86::COND_NE;
48653 break;
48654 case X86::COND_E:
48655 // testz -> testc.
48656 InvCC = X86::COND_B;
48657 break;
48658 case X86::COND_NE:
48659 // !testz -> !testc.
48660 InvCC = X86::COND_AE;
48661 break;
48662 case X86::COND_A:
48663 case X86::COND_BE:
48664 // testnzc -> testnzc (no change).
48665 InvCC = CC;
48666 break;
48667 default:
48668 InvCC = X86::COND_INVALID;
48669 break;
48670 }
48671
48672 if (InvCC != X86::COND_INVALID) {
48673 CC = InvCC;
48674 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48675 DAG.getBitcast(OpVT, NotOp0), Op1);
48676 }
48677 }
48678
48679 if (CC == X86::COND_B || CC == X86::COND_AE) {
48680 // TESTC(X,~X) == TESTC(X,-1)
48681 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48682 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48683 SDLoc DL(EFLAGS);
48684 return DAG.getNode(
48685 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48686 DAG.getBitcast(OpVT,
48687 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48688 }
48689 }
48690 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48691 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
      // NOTE(review): source line 48692 is missing — presumably a check that
      // Op1 is all-ones, e.g. ISD::isBuildVectorAllOnes(Op1.getNode())) {
48693 SDValue BC0 = peekThroughBitcasts(Op0);
48694 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
        // NOTE(review): source line 48695 is missing — presumably a check
        // that the PCMPEQ second operand is all-zeros, e.g.
        //   ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48696 SDLoc DL(EFLAGS);
48697 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48698 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48699 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48700 }
48701 }
48702 }
48703
48704 if (CC == X86::COND_E || CC == X86::COND_NE) {
48705 // TESTZ(X,~Y) == TESTC(Y,X)
48706 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48707 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48708 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48709 DAG.getBitcast(OpVT, NotOp1), Op0);
48710 }
48711
48712 if (Op0 == Op1) {
48713 SDValue BC = peekThroughBitcasts(Op0);
48714 EVT BCVT = BC.getValueType();
48715
48716 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48717 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48718 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48719 DAG.getBitcast(OpVT, BC.getOperand(0)),
48720 DAG.getBitcast(OpVT, BC.getOperand(1)));
48721 }
48722
48723 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48724 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48725 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48726 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48727 DAG.getBitcast(OpVT, BC.getOperand(0)),
48728 DAG.getBitcast(OpVT, BC.getOperand(1)));
48729 }
48730
48731 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48732 // to more efficiently extract the sign bits and compare that.
48733 // TODO: Handle TESTC with comparison inversion.
48734 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48735 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48736 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48737 unsigned EltBits = BCVT.getScalarSizeInBits();
48738 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48739 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48740 APInt SignMask = APInt::getSignMask(EltBits);
48741 if (SDValue Res =
48742 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48743 // For vXi16 cases we need to use pmovmksb and extract every other
48744 // sign bit.
48745 SDLoc DL(EFLAGS);
48746 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48747 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48748 MVT FloatVT =
48749 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48750 Res = DAG.getBitcast(FloatVT, Res);
48751 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48752 } else if (EltBits == 16) {
48753 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48754 Res = DAG.getBitcast(MovmskVT, Res);
48755 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
                // Keep only the odd (high-byte) sign bits of each i16 lane.
48756 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48757 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48758 } else {
48759 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48760 }
48761 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48762 DAG.getConstant(0, DL, MVT::i32));
48763 }
48764 }
48765 }
48766 }
48767
48768 // TESTZ(-1,X) == TESTZ(X,X)
    // NOTE(review): source line 48769 is missing — presumably
    //   if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48770 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48771
48772 // TESTZ(X,-1) == TESTZ(X,X)
    // NOTE(review): source line 48773 is missing — presumably
    //   if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48774 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48775
48776 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48777 // TODO: Add COND_NE handling?
48778 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48779 SDValue Src0 = peekThroughBitcasts(Op0);
48780 SDValue Src1 = peekThroughBitcasts(Op1);
48781 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
        // NOTE(review): source lines 48782 and 48784 are missing — presumably
        // calls of the form
        //   Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
        //   Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
        // confirm against upstream.
48783 peekThroughBitcasts(Src0.getOperand(1)), true);
48785 peekThroughBitcasts(Src1.getOperand(1)), true);
48786 if (Src0 && Src1) {
48787 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48788 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48789 DAG.getBitcast(OpVT2, Src0),
48790 DAG.getBitcast(OpVT2, Src1));
48791 }
48792 }
48793 }
48794 }
48795
48796 return SDValue();
48797}
48798
48799// Attempt to simplify the MOVMSK input based on the comparison type.
// NOTE(review): source line 48800 (the declaration, presumably
//   static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
// ) was dropped by the extraction — confirm against upstream.
48801 SelectionDAG &DAG,
48802 const X86Subtarget &Subtarget) {
48803 // Handle eq/ne against zero (any_of).
48804 // Handle eq/ne against -1 (all_of).
48805 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48806 return SDValue();
48807 if (EFLAGS.getValueType() != MVT::i32)
48808 return SDValue();
48809 unsigned CmpOpcode = EFLAGS.getOpcode();
48810 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48811 return SDValue();
48812 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48813 if (!CmpConstant)
48814 return SDValue();
48815 const APInt &CmpVal = CmpConstant->getAPIntValue();
48816
48817 SDValue CmpOp = EFLAGS.getOperand(0);
48818 unsigned CmpBits = CmpOp.getValueSizeInBits();
48819 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48820
48821 // Peek through any truncate.
48822 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48823 CmpOp = CmpOp.getOperand(0);
48824
48825 // Bail if we don't find a MOVMSK.
48826 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48827 return SDValue();
48828
48829 SDValue Vec = CmpOp.getOperand(0);
48830 MVT VecVT = Vec.getSimpleValueType();
48831 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48832 "Unexpected MOVMSK operand");
48833 unsigned NumElts = VecVT.getVectorNumElements();
48834 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48835
  // any_of: mask == 0; all_of: mask == all-ones over the element count.
48836 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48837 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48838 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48839 if (!IsAnyOf && !IsAllOf)
48840 return SDValue();
48841
48842 // TODO: Check more combining cases for me.
48843 // Here we check the cmp use number to decide do combining or not.
48844 // Currently we only get 2 tests about combining "MOVMSK(CONCAT(..))"
48845 // and "MOVMSK(PCMPEQ(..))" are fit to use this constraint.
48846 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48847
48848 // See if we can peek through to a vector with a wider element type, if the
48849 // signbits extend down to all the sub-elements as well.
48850 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48851 // potential SimplifyDemandedBits/Elts cases.
48852 // If we looked through a truncate that discard bits, we can't do this
48853 // transform.
48854 // FIXME: We could do this transform for truncates that discarded bits by
48855 // inserting an AND mask between the new MOVMSK and the CMP.
48856 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48857 SDValue BC = peekThroughBitcasts(Vec);
48858 MVT BCVT = BC.getSimpleValueType();
48859 unsigned BCNumElts = BCVT.getVectorNumElements();
48860 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48861 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48862 BCNumEltBits > NumEltBits &&
48863 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48864 SDLoc DL(EFLAGS);
48865 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48866 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48867 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48868 DAG.getConstant(CmpMask, DL, MVT::i32));
48869 }
48870 }
48871
48872 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48873 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48874 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48875 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48876 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
    // NOTE(review): source line 48877 is missing — presumably the Ops vector
    // declaration, e.g. SmallVector<SDValue> Ops;
48878 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48879 Ops.size() == 2) {
48880 SDLoc DL(EFLAGS);
48881 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48882 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48883 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48884 DAG.getBitcast(SubVT, Ops[0]),
48885 DAG.getBitcast(SubVT, Ops[1]));
48886 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48887 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48888 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48889 DAG.getConstant(CmpMask, DL, MVT::i32));
48890 }
48891 }
48892
48893 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48894 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48895 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48896 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48897 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48898 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48899 SDValue BC = peekThroughBitcasts(Vec);
48900 // Ensure MOVMSK was testing every signbit of BC.
48901 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48902 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48903 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48904 BC.getOperand(0), BC.getOperand(1));
48905 V = DAG.getBitcast(TestVT, V);
48906 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48907 }
48908 // Check for 256-bit split vector cases.
48909 if (BC.getOpcode() == ISD::AND &&
48910 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48911 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48912 SDValue LHS = BC.getOperand(0);
48913 SDValue RHS = BC.getOperand(1);
48914 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48915 LHS.getOperand(0), LHS.getOperand(1));
48916 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48917 RHS.getOperand(0), RHS.getOperand(1));
48918 LHS = DAG.getBitcast(TestVT, LHS);
48919 RHS = DAG.getBitcast(TestVT, RHS);
48920 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48921 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48922 }
48923 }
48924 }
48925
48926 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48927 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48928 // sign bits prior to the comparison with zero unless we know that
48929 // the vXi16 splats the sign bit down to the lower i8 half.
48930 // TODO: Handle all_of patterns.
48931 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48932 SDValue VecOp0 = Vec.getOperand(0);
48933 SDValue VecOp1 = Vec.getOperand(1);
48934 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48935 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48936 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48937 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48938 SDLoc DL(EFLAGS);
48939 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48940 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48941 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48942 if (!SignExt0) {
48943 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48944 DAG.getConstant(0xAAAA, DL, MVT::i16));
48945 }
48946 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48947 DAG.getConstant(0, DL, MVT::i16));
48948 }
48949 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48950 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48951 if (CmpBits >= 16 && Subtarget.hasInt256() &&
48952 (IsAnyOf || (SignExt0 && SignExt1))) {
48953 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48954 SDLoc DL(EFLAGS);
48955 SDValue Result = peekThroughBitcasts(Src);
48956 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48957 Result.getValueType().getVectorNumElements() <= NumElts) {
48958 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48959 Result.getOperand(0), Result.getOperand(1));
48960 V = DAG.getBitcast(MVT::v4i64, V);
48961 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48962 }
48963 Result = DAG.getBitcast(MVT::v32i8, Result);
48964 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48965 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
48966 if (!SignExt0 || !SignExt1) {
48967 assert(IsAnyOf &&
48968 "Only perform v16i16 signmasks for any_of patterns");
48969 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
48970 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48971 }
48972 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48973 DAG.getConstant(CmpMask, DL, MVT::i32));
48974 }
48975 }
48976 }
48977
48978 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
48979 // Since we peek through a bitcast, we need to be careful if the base vector
48980 // type has smaller elements than the MOVMSK type. In that case, even if
48981 // all the elements are demanded by the shuffle mask, only the "high"
48982 // elements which have highbits that align with highbits in the MOVMSK vec
48983 // elements are actually demanded. A simplification of spurious operations
48984 // on the "low" elements take place during other simplifications.
48985 //
48986 // For example:
48987 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) even though all the elements are
48988 // demanded, because we are swapping around the result can change.
48989 //
48990 // To address this, we check that we can scale the shuffle mask to MOVMSK
48991 // element width (this will ensure "high" elements match). Its slightly overly
48992 // conservative, but fine for an edge case fold.
48993 SmallVector<int, 32> ShuffleMask;
48994 SmallVector<SDValue, 2> ShuffleInputs;
48995 if (NumElts <= CmpBits &&
48996 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
48997 ShuffleMask, DAG) &&
48998 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
48999 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49000 canScaleShuffleElements(ShuffleMask, NumElts)) {
49001 SDLoc DL(EFLAGS);
49002 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49003 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49004 Result =
49005 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49006 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49007 }
49008
49009 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49010 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49011 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49012 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49013 // iff every element is referenced.
49014 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49015 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49016 (NumEltBits == 32 || NumEltBits == 64)) {
49017 SDLoc DL(EFLAGS);
49018 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49019 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49020 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49021 SDValue LHS = Vec;
    // any_of keeps TESTZ semantics (V,V); all_of uses TESTC against -1.
49022 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49023 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49024 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49025 DAG.getBitcast(FloatVT, LHS),
49026 DAG.getBitcast(FloatVT, RHS));
49027 }
49028
49029 return SDValue();
49030}
49031
49032/// Optimize an EFLAGS definition used according to the condition code \p CC
49033/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49034/// uses of chain values.
49036 SelectionDAG &DAG,
49037 const X86Subtarget &Subtarget) {
// If the consumer tests the carry flag (COND_B), first try to thread the
// flags through an ADD's carry output via combineCarryThroughADD.
49038 if (CC == X86::COND_B)
49039 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49040 return Flags;
// Otherwise run the EFLAGS simplifications in a fixed order; the first one
// that fires supplies the replacement flags value.
49042 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49043 return R;
49044
49045 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49046 return R;
49047
49048 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49049 return R;
49050
49051 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49052 return R;
// Finally, fall through to the atomic-arith flags combine; its result (which
// may be a null SDValue, meaning "no change") is returned directly.
49054 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49055}
49056
49057/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49060 const X86Subtarget &Subtarget) {
49061 SDLoc DL(N);
49062 EVT VT = N->getValueType(0);
// CMOV operand order is (False, True, CC, EFLAGS) -- the reverse of
// ISD::SELECT's (Cond, True, False) ordering.
49063 SDValue FalseOp = N->getOperand(0);
49064 SDValue TrueOp = N->getOperand(1);
49065 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49066 SDValue Cond = N->getOperand(3);
49067
49068 // cmov X, X, ?, ? --> X
49069 if (TrueOp == FalseOp)
49070 return TrueOp;
49071
49072 // Try to simplify the EFLAGS and condition code operands.
49073 // We can't always do this as FCMOV only supports a subset of X86 cond.
49074 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49075 if (!(FalseOp.getValueType() == MVT::f80 ||
49076 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49077 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49078 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49079 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49080 Flags};
49081 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49082 }
49083 }
49084
49085 // If this is a select between two integer constants, try to do some
49086 // optimizations. Note that the operands are ordered the opposite of SELECT
49087 // operands.
49088 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49089 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49090 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49091 // larger than FalseC (the false value).
49092 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49094 std::swap(TrueC, FalseC);
49095 std::swap(TrueOp, FalseOp);
49096 }
49097
49098 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49099 // This is efficient for any integer data type (including i8/i16) and
49100 // shift amount.
49101 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49102 Cond = getSETCC(CC, Cond, DL, DAG);
49103
49104 // Zero extend the condition if needed.
49105 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49106
49107 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49108 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49109 DAG.getConstant(ShAmt, DL, MVT::i8));
49110 return Cond;
49111 }
49112
49113 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
49114 // for any integer data type, including i8/i16.
49115 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49116 Cond = getSETCC(CC, Cond, DL, DAG);
49117
49118 // Zero extend the condition if needed.
49120 FalseC->getValueType(0), Cond);
49121 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49122 SDValue(FalseC, 0));
49123 return Cond;
49124 }
49125
49126 // Optimize cases that will turn into an LEA instruction. This requires
49127 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49128 if (VT == MVT::i32 || VT == MVT::i64) {
49129 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49130 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49131 "Implicit constant truncation");
49132
49133 bool isFastMultiplier = false;
49134 if (Diff.ult(10)) {
49135 switch (Diff.getZExtValue()) {
49136 default: break;
49137 case 1: // result = add base, cond
49138 case 2: // result = lea base( , cond*2)
49139 case 3: // result = lea base(cond, cond*2)
49140 case 4: // result = lea base( , cond*4)
49141 case 5: // result = lea base(cond, cond*4)
49142 case 8: // result = lea base( , cond*8)
49143 case 9: // result = lea base(cond, cond*8)
49144 isFastMultiplier = true;
49145 break;
49146 }
49147 }
49148
49149 if (isFastMultiplier) {
49150 Cond = getSETCC(CC, Cond, DL ,DAG);
49151 // Zero extend the condition if needed.
49152 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49153 Cond);
49154 // Scale the condition by the difference.
49155 if (Diff != 1)
49156 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49157 DAG.getConstant(Diff, DL, Cond.getValueType()));
49158
49159 // Add the base if non-zero.
49160 if (FalseC->getAPIntValue() != 0)
49161 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49162 SDValue(FalseC, 0));
49163 return Cond;
49164 }
49165 }
49166 }
49167 }
49168
49169 // Handle these cases:
49170 // (select (x != c), e, c) -> select (x != c), e, x),
49171 // (select (x == c), c, e) -> select (x == c), x, e)
49172 // where the c is an integer constant, and the "select" is the combination
49173 // of CMOV and CMP.
49174 //
49175 // The rationale for this change is that the conditional-move from a constant
49176 // needs two instructions, however, conditional-move from a register needs
49177 // only one instruction.
49178 //
49179 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49180 // some instruction-combining opportunities. This opt needs to be
49181 // postponed as late as possible.
49182 //
49183 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49184 // the DCI.xxxx conditions are provided to postpone the optimization as
49185 // late as possible.
49186
49187 ConstantSDNode *CmpAgainst = nullptr;
49188 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49189 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49190 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49191
49192 if (CC == X86::COND_NE &&
49193 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49195 std::swap(TrueOp, FalseOp);
49196 }
49197
49198 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49199 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49200 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49201 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49202 }
49203 }
49204 }
49205
49206 // Transform:
49207 //
49208 // (cmov 1 T (uge T 2))
49209 //
49210 // to:
49211 //
49212 // (adc T 0 (sub T 1))
49213 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49214 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49215 SDValue Cond0 = Cond.getOperand(0);
49216 if (Cond0.getOpcode() == ISD::TRUNCATE)
49217 Cond0 = Cond0.getOperand(0);
49218 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49219 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49220 EVT CondVT = Cond->getValueType(0);
49221 // Subtract 1 and generate a carry.
49222 SDValue NewSub =
49223 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49224 DAG.getConstant(1, DL, CondVT));
49225 SDValue EFLAGS(NewSub.getNode(), 1);
49226 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49227 DAG.getConstant(0, DL, VT), EFLAGS);
49228 }
49229 }
49230
49231 // Fold and/or of setcc's to double CMOV:
49232 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49233 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49234 //
49235 // This combine lets us generate:
49236 // cmovcc1 (jcc1 if we don't have CMOV)
49237 // cmovcc2 (same)
49238 // instead of:
49239 // setcc1
49240 // setcc2
49241 // and/or
49242 // cmovne (jne if we don't have CMOV)
49243 // When we can't use the CMOV instruction, it might increase branch
49244 // mispredicts.
49245 // When we can use CMOV, or when there is no mispredict, this improves
49246 // throughput and reduces register pressure.
49247 //
49248 if (CC == X86::COND_NE) {
49249 SDValue Flags;
49250 X86::CondCode CC0, CC1;
49251 bool isAndSetCC;
49252 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49253 if (isAndSetCC) {
49254 std::swap(FalseOp, TrueOp);
49257 }
49258
49259 SDValue LOps[] = {FalseOp, TrueOp,
49260 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49261 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49262 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49263 Flags};
49264 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49265 return CMOV;
49266 }
49267 }
49268
49269 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49270 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49271 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49272 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49273 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49274 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49275 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49276 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49277 SDValue Add = TrueOp;
49278 SDValue Const = FalseOp;
49279 // Canonicalize the condition code for easier matching and output.
49280 if (CC == X86::COND_E)
49281 std::swap(Add, Const);
49282
49283 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49284 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49285 Add.getResNo() == 0 && Add.hasOneUse() &&
49286 Add.getOperand(1) == Cond.getOperand(0)) {
49287 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49288 Add.getOperand(1));
49289 }
49290
49291 // We might have replaced the constant in the cmov with the LHS of the
49292 // compare. If so change it to the RHS of the compare.
49293 if (Const == Cond.getOperand(0))
49294 Const = Cond.getOperand(1);
49295
49296 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49297 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49298 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49299 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49300 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49301 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49302 // This should constant fold.
49303 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49304 SDValue CMov =
49305 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49306 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49307 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49308 }
49309 }
// No fold matched; leave the CMOV as-is.
49311 return SDValue();
49312}
49313
49314/// Different mul shrinking modes.
49316
// Returns true if both i32-element operands of the mul \p N are known narrow
// enough that the multiply can be performed at a smaller width, setting
// \p Mode to the narrowest safe variant (MULS8/MULU8/MULS16/MULU16).
// Thresholds are expressed in ComputeNumSignBits terms: e.g. >= 25 sign bits
// on a 32-bit value means it round-trips through an i8 sign-extension.
49318 EVT VT = N->getOperand(0).getValueType();
49319 if (VT.getScalarSizeInBits() != 32)
49320 return false;
49321
49322 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49323 unsigned SignBits[2] = {1, 1};
49324 bool IsPositive[2] = {false, false};
49325 for (unsigned i = 0; i < 2; i++) {
49326 SDValue Opd = N->getOperand(i);
49327
49328 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49329 IsPositive[i] = DAG.SignBitIsZero(Opd);
49330 }
49331
49332 bool AllPositive = IsPositive[0] && IsPositive[1];
// The weaker operand (fewest known sign bits) decides the mode.
49333 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49334 // When ranges are from -128 ~ 127, use MULS8 mode.
49335 if (MinSignBits >= 25)
49336 Mode = ShrinkMode::MULS8;
49337 // When ranges are from 0 ~ 255, use MULU8 mode.
49338 else if (AllPositive && MinSignBits >= 24)
49339 Mode = ShrinkMode::MULU8;
49340 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49341 else if (MinSignBits >= 17)
49342 Mode = ShrinkMode::MULS16;
49343 // When ranges are from 0 ~ 65535, use MULU16 mode.
49344 else if (AllPositive && MinSignBits >= 16)
49345 Mode = ShrinkMode::MULU16;
49346 else
49347 return false;
49348 return true;
49349}
49350
49351/// When the operands of vector mul are extended from smaller size values,
49352/// like i8 and i16, the type of mul may be shrunk to generate more
49353/// efficient code. Two typical patterns are handled:
49354/// Pattern1:
49355/// %2 = sext/zext <N x i8> %1 to <N x i32>
49356/// %4 = sext/zext <N x i8> %3 to <N x i32>
49357// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49358/// %5 = mul <N x i32> %2, %4
49359///
49360/// Pattern2:
49361/// %2 = zext/sext <N x i16> %1 to <N x i32>
49362/// %4 = zext/sext <N x i16> %3 to <N x i32>
49363/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49364/// %5 = mul <N x i32> %2, %4
49365///
49366/// There are four mul shrinking modes:
49367/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49368/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49369/// generate pmullw+sext32 for it (MULS8 mode).
49370/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49371/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49372/// generate pmullw+zext32 for it (MULU8 mode).
49373/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49374/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49375/// generate pmullw+pmulhw for it (MULS16 mode).
49376/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49377/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49378/// generate pmullw+pmulhuw for it (MULU16 mode).
49380 const X86Subtarget &Subtarget) {
49381 // Check for legality
49382 // pmullw/pmulhw are not supported by SSE.
49383 if (!Subtarget.hasSSE2())
49384 return SDValue();
49385
49386 // Check for profitability
49387 // pmulld is supported since SSE41. It is better to use pmulld
49388 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49389 // the expansion.
49390 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49391 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49392 return SDValue();
49393
49394 ShrinkMode Mode;
49395 if (!canReduceVMulWidth(N, DAG, Mode))
49396 return SDValue();
49397
49398 SDValue N0 = N->getOperand(0);
49399 SDValue N1 = N->getOperand(1);
49400 EVT VT = N->getOperand(0).getValueType();
49401 unsigned NumElts = VT.getVectorNumElements();
// The lo/hi repacking below interleaves half-vectors, so an even element
// count is required.
49402 if ((NumElts % 2) != 0)
49403 return SDValue();
49404
49405 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49406
49407 // Shrink the operands of mul.
49408 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49409 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49410
49411 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49412 // lower part is needed.
49413 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49414 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49415 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49417 DL, VT, MulLo);
49418
49419 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49420 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49421 // the higher part is also needed.
49422 SDValue MulHi =
49423 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49424 ReducedVT, NewN0, NewN1);
49425
49426 // Repack the lower part and higher part result of mul into a wider
49427 // result.
49428 // Generate shuffle functioning as punpcklwd.
49429 SmallVector<int, 16> ShuffleMask(NumElts);
49430 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49431 ShuffleMask[2 * i] = i;
49432 ShuffleMask[2 * i + 1] = i + NumElts;
49433 }
49434 SDValue ResLo =
49435 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49436 ResLo = DAG.getBitcast(ResVT, ResLo);
49437 // Generate shuffle functioning as punpckhwd.
49438 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49439 ShuffleMask[2 * i] = i + NumElts / 2;
49440 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49441 }
49442 SDValue ResHi =
49443 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49444 ResHi = DAG.getBitcast(ResVT, ResHi);
49445 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49446}
49447
49449 EVT VT, const SDLoc &DL) {
// Lower a multiply by selected "awkward" constants (11, 13, 19, 21, 22, 23,
// 26, 28, 29, 37, 41, 73, or a sum of two powers of two) into short
// sequences built from LEA-style mul-by-3/5/9 (MUL_IMM), shifts and add/sub.
49450
49451 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
// (x * Mult) << Shift, then +/- x.
49452 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49453 DAG.getConstant(Mult, DL, VT));
49454 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49455 DAG.getConstant(Shift, DL, MVT::i8));
49456 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49457 N->getOperand(0));
49458 return Result;
49459 };
49460
49461 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
// (x * Mul1) * Mul2, then +/- x.
49462 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49463 DAG.getConstant(Mul1, DL, VT));
49464 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49465 DAG.getConstant(Mul2, DL, VT));
49466 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49467 N->getOperand(0));
49468 return Result;
49469 };
49470
49471 switch (MulAmt) {
49472 default:
49473 break;
49474 case 11:
49475 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49476 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49477 case 21:
49478 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49479 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49480 case 41:
49481 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49482 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49483 case 22:
49484 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49485 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49486 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49487 case 19:
49488 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49489 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49490 case 37:
49491 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49492 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49493 case 73:
49494 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49495 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49496 case 13:
49497 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49498 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49499 case 23:
49500 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49501 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49502 case 26:
49503 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49504 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49505 case 28:
49506 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49507 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49508 case 29:
49509 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49510 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49511 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49512 }
49513
49514 // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed
49515 // by a single LEA.
49516 // First check if this a sum of two power of 2s because that's easy. Then
49517 // count how many zeros are up to the first bit.
49518 // TODO: We can do this even without LEA at a cost of two shifts and an add.
49519 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49520 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49521 if (ScaleShift >= 1 && ScaleShift < 4) {
49522 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49523 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49524 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49525 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49526 DAG.getConstant(ScaleShift, DL, MVT::i8));
49527 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49528 }
49529 }
49530
49531 return SDValue();
49532}
49533
49534// If the upper 17 bits of either element are zero and the other element are
49535// zero/sign bits then we can use PMADDWD, which is always at least as quick as
49536// PMULLD, except on KNL.
// Matches (mul vXi32 x, y) where both operands have at most 16 significant
// bits and at least one side's upper 17 bits are (or can be made) zero, and
// replaces it with VPMADDWD on the operands bitcast to vXi16.
49538 SelectionDAG &DAG,
49539 const X86Subtarget &Subtarget) {
49540 if (!Subtarget.hasSSE2())
49541 return SDValue();
49542
49543 if (Subtarget.isPMADDWDSlow())
49544 return SDValue();
49545
49546 EVT VT = N->getValueType(0);
49547
49548 // Only support vXi32 vectors.
49549 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49550 return SDValue();
49551
49552 // Make sure the type is legal or can split/widen to a legal type.
49553 // With AVX512 but without BWI, we would need to split v32i16.
49554 unsigned NumElts = VT.getVectorNumElements();
49555 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49556 return SDValue();
49557
49558 // With AVX512 but without BWI, we would need to split v32i16.
49559 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49560 return SDValue();
49561
49562 SDValue N0 = N->getOperand(0);
49563 SDValue N1 = N->getOperand(1);
49564
49565 // If we are zero/sign extending two steps without SSE4.1, its better to
49566 // reduce the vmul width instead.
49567 if (!Subtarget.hasSSE41() &&
49568 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49569 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49570 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49571 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49572 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49573 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49574 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49575 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49576 return SDValue();
49577
49578 // If we are sign extending a wide vector without SSE4.1, its better to reduce
49579 // the vmul width instead.
49580 if (!Subtarget.hasSSE41() &&
49581 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49582 N0.getOperand(0).getValueSizeInBits() > 128) &&
49583 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49584 N1.getOperand(0).getValueSizeInBits() > 128))
49585 return SDValue();
49586
49587 // Sign bits must extend down to the lowest i16.
49588 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49589 DAG.ComputeMaxSignificantBits(N0) > 16)
49590 return SDValue();
49591
49592 // At least one of the elements must be zero in the upper 17 bits, or can be
49593 // safely made zero without altering the final result.
// Returns a replacement operand whose upper 17 bits are provably zero, or a
// null SDValue if this operand can't be made zeroable.
49594 auto GetZeroableOp = [&](SDValue Op) {
49595 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49596 if (DAG.MaskedValueIsZero(Op, Mask17))
49597 return Op;
49598 // Mask off upper 16-bits of sign-extended constants.
49600 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49601 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49602 SDValue Src = Op.getOperand(0);
49603 // Convert sext(vXi16) to zext(vXi16).
49604 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49605 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49606 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49607 // which will expand the extension.
49608 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49609 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49610 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49611 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49612 }
49613 }
49614 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
49615 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49616 N->isOnlyUserOf(Op.getNode())) {
49617 SDValue Src = Op.getOperand(0);
49618 if (Src.getScalarValueSizeInBits() == 16)
49619 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49620 }
49621 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49622 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49623 N->isOnlyUserOf(Op.getNode())) {
49624 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49625 Op.getOperand(1));
49626 }
49627 return SDValue();
49628 };
49629 SDValue ZeroN0 = GetZeroableOp(N0);
49630 SDValue ZeroN1 = GetZeroableOp(N1);
// One zeroable side suffices; keep the original operand on the other side.
49631 if (!ZeroN0 && !ZeroN1)
49632 return SDValue();
49633 N0 = ZeroN0 ? ZeroN0 : N0;
49634 N1 = ZeroN1 ? ZeroN1 : N1;
49635
49636 // Use SplitOpsAndApply to handle AVX splitting.
49637 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49638 ArrayRef<SDValue> Ops) {
49639 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49640 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49641 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49642 DAG.getBitcast(OpVT, Ops[0]),
49643 DAG.getBitcast(OpVT, Ops[1]));
49644 };
49645 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49646}
49647
49649 const X86Subtarget &Subtarget) {
// Lower a vXi64 multiply to PMULDQ (signed; needs more than 32 sign bits on
// both operands) or PMULUDQ (unsigned; needs zero upper 32 bits on both),
// which multiply only the low 32 bits of each element.
49650 if (!Subtarget.hasSSE2())
49651 return SDValue();
49652
49653 EVT VT = N->getValueType(0);
49654
49655 // Only support vXi64 vectors.
49656 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49657 VT.getVectorNumElements() < 2 ||
49659 return SDValue();
49660
49661 SDValue N0 = N->getOperand(0);
49662 SDValue N1 = N->getOperand(1);
49663
49664 // MULDQ returns the 64-bit result of the signed multiplication of the lower
49665 // 32-bits. We can lower with this if the sign bits stretch that far.
49666 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49667 DAG.ComputeNumSignBits(N1) > 32) {
49668 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49669 ArrayRef<SDValue> Ops) {
49670 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49671 };
49672 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49673 /*CheckBWI*/ false);
49674 }
49675
49676 // If the upper bits are zero we can use a single pmuludq.
49677 APInt Mask = APInt::getHighBitsSet(64, 32);
49678 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49679 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49680 ArrayRef<SDValue> Ops) {
49681 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49682 };
49683 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49684 /*CheckBWI*/ false);
49685 }
49686
49687 return SDValue();
49688}
49689
49692 const X86Subtarget &Subtarget) {
// Combine ISD::MUL: first try the vector narrowing combines
// (PMADDWD, PMULDQ/PMULUDQ, reduced-width vmul), then decompose
// multiplies by constants into LEA-style mul-by-3/5/9, shifts and
// add/sub sequences for scalar i32/i64 and suitable vector types.
49693 EVT VT = N->getValueType(0);
49694 SDLoc DL(N);
49695
49696 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49697 return V;
49698
49699 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49700 return V;
49701
49702 if (DCI.isBeforeLegalize() && VT.isVector())
49703 return reduceVMULWidth(N, DL, DAG, Subtarget);
49704
49705 if (VT != MVT::i64 && VT != MVT::i32 &&
49706 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49707 return SDValue();
// All remaining folds need a constant RHS (known via value tracking).
49709 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49710 if (!Known1.isConstant())
49711 return SDValue();
49712
49713 const APInt &C = Known1.getConstant();
49714 if (C.isZero())
49715 return DAG.getConstant(0, DL, VT);
49716
49717 if (C.isAllOnes())
49718 return DAG.getNegative(N->getOperand(0), DL, VT);
// Pure power-of-two multiplies are handled elsewhere as plain shifts.
49720 if (isPowerOf2_64(C.getZExtValue()))
49721 return SDValue();
49722
49723 // Optimize a single multiply with constant into two operations in order to
49724 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49726 return SDValue();
49727
49728 // An imul is usually smaller than the alternative sequence.
49730 return SDValue();
49731
49732 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49733 return SDValue();
49734
49735 int64_t SignMulAmt = C.getSExtValue();
49736 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49737 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49738
49739 SDValue NewMul = SDValue();
49740 if (VT == MVT::i64 || VT == MVT::i32) {
49741 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49742 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49743 DAG.getConstant(AbsMulAmt, DL, VT));
49744 if (SignMulAmt < 0)
49745 NewMul = DAG.getNegative(NewMul, DL, VT);
49746
49747 return NewMul;
49748 }
// Try to factor the amount as (3|5|9) * M so one factor maps to an LEA.
49750 uint64_t MulAmt1 = 0;
49751 uint64_t MulAmt2 = 0;
49752 if ((AbsMulAmt % 9) == 0) {
49753 MulAmt1 = 9;
49754 MulAmt2 = AbsMulAmt / 9;
49755 } else if ((AbsMulAmt % 5) == 0) {
49756 MulAmt1 = 5;
49757 MulAmt2 = AbsMulAmt / 5;
49758 } else if ((AbsMulAmt % 3) == 0) {
49759 MulAmt1 = 3;
49760 MulAmt2 = AbsMulAmt / 3;
49761 }
49762
49763 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49764 if (MulAmt2 &&
49765 (isPowerOf2_64(MulAmt2) ||
49766 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49767
49768 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49769 N->user_begin()->getOpcode() == ISD::ADD))
49770 // If second multiplifer is pow2, issue it first. We want the multiply
49771 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49772 // use is an add. Only do this for positive multiply amounts since the
49773 // negate would prevent it from being used as an address mode anyway.
49774 std::swap(MulAmt1, MulAmt2);
49775
49776 if (isPowerOf2_64(MulAmt1))
49777 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49778 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49779 else
49780 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49781 DAG.getConstant(MulAmt1, DL, VT));
49782
49783 if (isPowerOf2_64(MulAmt2))
49784 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49785 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49786 else
49787 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49788 DAG.getConstant(MulAmt2, DL, VT));
49789
49790 // Negate the result.
49791 if (SignMulAmt < 0)
49792 NewMul = DAG.getNegative(NewMul, DL, VT);
49793 } else if (!Subtarget.slowLEA())
49794 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49795 }
// Fall back to shift/add/sub decompositions around a nearby power of two.
49796 if (!NewMul) {
49797 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49798 if (isPowerOf2_64(AbsMulAmt - 1)) {
49799 // (mul x, 2^N + 1) => (add (shl x, N), x)
49800 NewMul = DAG.getNode(
49801 ISD::ADD, DL, VT, N->getOperand(0),
49802 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49803 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49804 if (SignMulAmt < 0)
49805 NewMul = DAG.getNegative(NewMul, DL, VT);
49806 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49807 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49808 NewMul =
49809 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49810 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49811 // To negate, reverse the operands of the subtract.
49812 if (SignMulAmt < 0)
49813 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49814 else
49815 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49816 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49817 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49818 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49819 NewMul =
49820 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49821 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49822 NewMul = DAG.getNode(
49823 ISD::ADD, DL, VT, NewMul,
49824 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49825 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49826 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49827 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49828 NewMul =
49829 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49830 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49831 NewMul = DAG.getNode(
49832 ISD::SUB, DL, VT, NewMul,
49833 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49834 } else if (SignMulAmt >= 0 && VT.isVector() &&
49835 Subtarget.fastImmVectorShift()) {
// Vector case: amount splits as (2^a +/- 2^b) => two shifts and an add/sub.
49836 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49837 uint64_t ShiftAmt1;
49838 std::optional<unsigned> Opc;
49839 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49840 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49841 Opc = ISD::ADD;
49842 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49843 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49844 Opc = ISD::SUB;
49845 }
49846
49847 if (Opc) {
49848 SDValue Shift1 =
49849 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49850 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49851 SDValue Shift2 =
49852 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49853 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49854 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49855 }
49856 }
49857 }
// Null if no decomposition applied.
49859 return NewMul;
49860}
49861
49862// Try to form a MULHU or MULHS node by looking for
49863// (srl (mul ext, ext), 16)
49864// TODO: This is X86 specific because we want to be able to handle wide types
49865// before type legalization. But we can only do it if the vector will be
49866// legalized via widening/splitting. Type legalization can't handle promotion
49867// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49868// combiner.
// NOTE(review): the opening signature line (49869) was dropped by the
// extraction that produced this listing; presumably
// "static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG," --
// the name is grounded by the call sites in the SRA/SRL combines below,
// but confirm the exact signature against the original source.
49870 const SDLoc &DL,
49871 const X86Subtarget &Subtarget) {
49872 using namespace SDPatternMatch;
// Only SRL/SRA callers reach here (see the SRA/SRL combines below).
49873 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49874 "SRL or SRA node is required here!");
49875
// The vector multiply-high instructions this forms require SSE2.
49876 if (!Subtarget.hasSSE2())
49877 return SDValue();
49878
49879 // Input type should be at least vXi32.
49880 EVT VT = N->getValueType(0);
49881 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49882 return SDValue();
49883
49884 // The operation must be a multiply shifted right by 16.
49885 SDValue LHS, RHS;
49886 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49887 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49888 return SDValue();
49889
// Both multiply operands must be extended the same way (both sign-extend
// or both zero-extend) so the high half is well-defined.
49890 unsigned ExtOpc = LHS.getOpcode();
49891 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49892 RHS.getOpcode() != ExtOpc)
49893 return SDValue();
49894
49895 // Peek through the extends.
49896 LHS = LHS.getOperand(0);
49897 RHS = RHS.getOperand(0);
49898
49899 // Ensure the input types match.
// The pre-extension element type must be i16 (PMULHW/PMULHUW operate on
// vXi16) and both sides must agree.
49900 EVT MulVT = LHS.getValueType();
49901 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49902 return SDValue();
49903
// Sign-extended inputs -> MULHS; zero-extended inputs -> MULHU.
49904 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49905 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49906
// Re-extend the vXi16 high-half result back to the original wide type,
// matching the signedness of the shift that consumed it (SRA -> sign
// extend, SRL -> zero extend).
49907 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49908 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49909}
49910
// Combine for scalar/vector ISD::SHL nodes.
// NOTE(review): the opening signature line (49911) was dropped by the
// extraction; presumably "static SDValue combineShiftLeft(SDNode *N,
// SelectionDAG &DAG," -- confirm against the original source.
49912 const X86Subtarget &Subtarget) {
49913 using namespace llvm::SDPatternMatch;
49914 SDValue N0 = N->getOperand(0);
49915 SDValue N1 = N->getOperand(1);
49916 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49917 EVT VT = N0.getValueType();
49918 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49919 SDLoc DL(N);
49920
49921 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49922 // with out-of-bounds clamping.
49923 if (N0.getOpcode() == ISD::VSELECT &&
49924 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49925 SDValue Cond = N0.getOperand(0);
49926 SDValue N00 = N0.getOperand(1);
49927 SDValue N01 = N0.getOperand(2);
49928 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
// NOTE(review): a condition line (49929) is missing from this listing --
// likely an "if (" with a check that N01 is all-zeros; verify upstream.
49930 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49931 m_SpecificCondCode(ISD::SETULT)))) {
// VSHLV already yields 0 for out-of-range amounts, so the select is
// redundant once the condition above holds.
49932 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
49933 }
49934 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
// NOTE(review): a condition line (49935) is missing here as well --
// likely the mirror-image "if (" checking N00 is all-zeros.
49936 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49937 m_SpecificCondCode(ISD::SETUGE)))) {
49938 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
49939 }
49940 }
49941
49942 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49943 // since the result of setcc_c is all zero's or all ones.
49944 if (VT.isInteger() && !VT.isVector() &&
49945 N1C && N0.getOpcode() == ISD::AND &&
49946 N0.getOperand(1).getOpcode() == ISD::Constant) {
49947 SDValue N00 = N0.getOperand(0);
// Pre-shift the AND mask by the shl amount so it can be applied directly
// to the setcc_c value.
49948 APInt Mask = N0.getConstantOperandAPInt(1);
49949 Mask <<= N1C->getAPIntValue();
49950 bool MaskOK = false;
49951 // We can handle cases concerning bit-widening nodes containing setcc_c if
49952 // we carefully interrogate the mask to make sure we are semantics
49953 // preserving.
49954 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
49955 // of the underlying setcc_c operation if the setcc_c was zero extended.
49956 // Consider the following example:
49957 // zext(setcc_c) -> i32 0x0000FFFF
49958 // c1 -> i32 0x0000FFFF
49959 // c2 -> i32 0x00000001
49960 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49961 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
49962 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
49963 MaskOK = true;
49964 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
// NOTE(review): line 49965 is missing from this listing -- presumably a
// check that the sign-extend's operand is a SETCC_CARRY; verify upstream.
49966 MaskOK = true;
49967 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
49968 N00.getOpcode() == ISD::ANY_EXTEND) &&
// NOTE(review): line 49969 is missing -- presumably the matching inner
// SETCC_CARRY check for the zext/anyext case.
// Zero/any-extend is only safe while the shifted mask stays within the
// narrower source width (see the worked example above).
49970 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
49971 }
49972 if (MaskOK && Mask != 0)
49973 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
49974 }
49975
49976 return SDValue();
49977}
49978
// Combine for scalar/vector ISD::SRA nodes.
// NOTE(review): the opening signature line (49979) was dropped by the
// extraction; presumably "static SDValue combineShiftRightArithmetic(
// SDNode *N, SelectionDAG &DAG," -- confirm against the original source.
49980 const X86Subtarget &Subtarget) {
49981 using namespace llvm::SDPatternMatch;
49982 SDValue N0 = N->getOperand(0);
49983 SDValue N1 = N->getOperand(1);
49984 EVT VT = N0.getValueType();
49985 unsigned Size = VT.getSizeInBits();
49986 SDLoc DL(N);
49987
// First try the (sra (mul sext, sext), 16) -> MULHS fold.
49988 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
49989 return V;
49990
49991 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
49992 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
49993 SDValue ShrAmtVal;
49994 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
// NOTE(review): the matcher's second argument (line 49995, presumably a
// specific bitwidth-1 constant) is missing from this listing.
// VSRAV natively clamps/saturates the per-element amount, making the
// explicit umin redundant.
49996 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
49997 }
49998
49999 // fold (SRA (SHL X, ShlConst), SraConst)
50000 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50001 // or (sext_in_reg X)
50002 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50003 // depending on relation between SraConst and ShlConst.
50004 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50005 // us to do the sext_in_reg from corresponding bit.
50006
50007 // sexts in X86 are MOVs. The MOVs have the same code size
50008 // as above SHIFTs (only SHIFT on 1 has lower code size).
50009 // However the MOVs have 2 advantages to a SHIFT:
50010 // 1. MOVs can write to a register that differs from source
50011 // 2. MOVs accept memory operands
50012
// Scalar-only fold: bail out for vectors, non-constant amounts, or when
// the source isn't a single-use SHL.
50013 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50014 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
// NOTE(review): one more guard (line 50015) is missing from this listing
// -- likely a constant-operand check on the SHL's shift amount.
50016 return SDValue();
50017
50018 SDValue N00 = N0.getOperand(0);
50019 SDValue N01 = N0.getOperand(1);
// Both shift amounts as arbitrary-precision integers.
50020 APInt ShlConst = N01->getAsAPIntVal();
50021 APInt SraConst = N1->getAsAPIntVal();
50022 EVT CVT = N1.getValueType();
50023
// The two shift-amount constants must have the same type so their
// difference can be re-emitted as a CVT constant below.
50024 if (CVT != N01.getValueType())
50025 return SDValue();
50026 if (SraConst.isNegative())
50027 return SDValue();
50028
50029 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50030 unsigned ShiftSize = SVT.getSizeInBits();
50031 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50032 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50033 continue;
// Sign-extend-in-reg from the SVT bit, then shift by the leftover amount
// (or not at all when the two shifts cancel exactly).
50034 SDValue NN =
50035 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50036 if (SraConst.eq(ShlConst))
50037 return NN;
50038 if (SraConst.ult(ShlConst))
50039 return DAG.getNode(ISD::SHL, DL, VT, NN,
50040 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50041 return DAG.getNode(ISD::SRA, DL, VT, NN,
50042 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50043 }
50044 return SDValue();
50045}
50046
// Combine for scalar/vector ISD::SRL nodes.
// NOTE(review): the opening signature lines (50047-50048) were dropped by
// the extraction; presumably "static SDValue combineShiftRightLogical(
// SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI," --
// DCI is used below, so it must be a parameter; confirm upstream.
50049 const X86Subtarget &Subtarget) {
50050 using namespace llvm::SDPatternMatch;
50051 SDValue N0 = N->getOperand(0);
50052 SDValue N1 = N->getOperand(1);
50053 EVT VT = N0.getValueType();
50054 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50055 SDLoc DL(N);
50056
// First try the (srl (mul zext, zext), 16) -> MULHU fold.
50057 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50058 return V;
50059
50060 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50061 // with out-of-bounds clamping.
50062 if (N0.getOpcode() == ISD::VSELECT &&
50063 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50064 SDValue Cond = N0.getOperand(0);
50065 SDValue N00 = N0.getOperand(1);
50066 SDValue N01 = N0.getOperand(2);
50067 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
// NOTE(review): a condition line (50068) is missing from this listing --
// likely an "if (" with an all-zeros check on N01; verify upstream.
50069 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50070 m_SpecificCondCode(ISD::SETULT)))) {
50071 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50072 }
50073 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
// NOTE(review): a condition line (50074) is missing here as well.
50075 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50076 m_SpecificCondCode(ISD::SETUGE)))) {
50077 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50078 }
50079 }
50080
50081 // Only do this on the last DAG combine as it can interfere with other
50082 // combines.
50083 if (!DCI.isAfterLegalizeDAG())
50084 return SDValue();
50085
50086 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50087 // TODO: This is a generic DAG combine that became an x86-only combine to
50088 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50089 // and-not ('andn').
50090 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50091 return SDValue();
50092
50093 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50094 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50095 if (!ShiftC || !AndC)
50096 return SDValue();
50097
50098 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50099 // transform should reduce code size. It may also enable secondary transforms
50100 // from improved known-bits analysis or instruction selection.
50101 APInt MaskVal = AndC->getAPIntValue();
50102
50103 // If this can be matched by a zero extend, don't optimize.
// A low-bit mask of 8/16/32/64 ones is selectable as movzx; leave it be.
50104 if (MaskVal.isMask()) {
50105 unsigned TO = MaskVal.countr_one();
50106 if (TO >= 8 && isPowerOf2_32(TO))
50107 return SDValue();
50108 }
50109
// Only reorder when it shrinks the mask across an immediate-size boundary
// (>8 bits down to <=8, or >32 down to <=32).
50110 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50111 unsigned OldMaskSize = MaskVal.getSignificantBits();
50112 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50113 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50114 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50115 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50116 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50117 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50118 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50119 }
50120 return SDValue();
50121}
50122
// Attempt to fold horizontal-op(shuffle,shuffle) into shuffle(horizontal-op)
// so the cheaper shuffle runs after the HADD/HSUB/PACK node.
// NOTE(review): the opening signature line (50123) was dropped by the
// extraction; presumably "static SDValue combineHorizOpWithShuffle(
// SDNode *N, SelectionDAG &DAG," -- the name is grounded by the call sites
// in the PACK and HADD/HSUB combines below.
50124 const X86Subtarget &Subtarget) {
50125 unsigned Opcode = N->getOpcode();
50126 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50127
50128 SDLoc DL(N);
50129 EVT VT = N->getValueType(0);
50130 SDValue N0 = N->getOperand(0);
50131 SDValue N1 = N->getOperand(1);
50132 EVT SrcVT = N0.getValueType();
50133
// Only look through bitcasts when this node is the sole user, so the
// rewritten tree doesn't duplicate work for other users.
50134 SDValue BC0 =
50135 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50136 SDValue BC1 =
50137 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50138
50139 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50140 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50141 // truncation trees that help us avoid lane crossing shuffles.
50142 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50143 // TODO: We don't handle vXf64 shuffles yet.
50144 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50145 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
// NOTE(review): line 50146 is missing from this listing -- presumably the
// declaration of ShuffleOps used below; verify upstream.
50147 SmallVector<int> ShuffleMask, ScaledMask;
50148 SDValue Vec = peekThroughBitcasts(BCSrc);
50149 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
// NOTE(review): line 50150 is also missing here.
50151 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50152 // shuffle to a v4X64 width - we can probably relax this in the future.
50153 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50154 ShuffleOps[0].getValueType().is256BitVector() &&
50155 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50156 SDValue Lo, Hi;
50157 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50158 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50159 Lo = DAG.getBitcast(SrcVT, Lo);
50160 Hi = DAG.getBitcast(SrcVT, Hi);
50161 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50162 Res = DAG.getBitcast(ShufVT, Res);
50163 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50164 return DAG.getBitcast(VT, Res);
50165 }
50166 }
50167 }
50168 }
50169
50170 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50171 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50172 // If either/both ops are a shuffle that can scale to v2x64,
50173 // then see if we can perform this as a v4x32 post shuffle.
50174 SmallVector<SDValue> Ops0, Ops1;
50175 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50176 bool IsShuf0 =
50177 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50178 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50179 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50180 bool IsShuf1 =
50181 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50182 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50183 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50184 if (IsShuf0 || IsShuf1) {
// Treat a non-shuffle operand as an identity shuffle of itself so both
// sides can be processed uniformly.
50185 if (!IsShuf0) {
50186 Ops0.assign({BC0});
50187 ScaledMask0.assign({0, 1});
50188 }
50189 if (!IsShuf1) {
50190 Ops1.assign({BC1});
50191 ScaledMask1.assign({0, 1});
50192 }
50193
50194 SDValue LHS, RHS;
50195 int PostShuffle[4] = {-1, -1, -1, -1};
// Greedily assign each referenced source to LHS (indices 0-1) or RHS
// (indices 2-3); fail if a third distinct source shows up.
50196 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50197 if (M < 0)
50198 return true;
50199 Idx = M % 2;
50200 SDValue Src = Ops[M / 2];
50201 if (!LHS || LHS == Src) {
50202 LHS = Src;
50203 return true;
50204 }
50205 if (!RHS || RHS == Src) {
50206 Idx += 2;
50207 RHS = Src;
50208 return true;
50209 }
50210 return false;
50211 };
50212 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50213 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50214 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50215 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50216 LHS = DAG.getBitcast(SrcVT, LHS);
// RHS may never have been assigned (single-source case) - reuse LHS.
50217 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50218 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50219 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50220 Res = DAG.getBitcast(ShufVT, Res);
50221 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50222 return DAG.getBitcast(VT, Res);
50223 }
50224 }
50225 }
50226
50227 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50228 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50229 SmallVector<int> Mask0, Mask1;
50230 SmallVector<SDValue> Ops0, Ops1;
50231 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50232 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50233 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50234 !Ops0.empty() && !Ops1.empty() &&
50235 all_of(Ops0,
50236 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50237 all_of(Ops1,
50238 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50239 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50240 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50241 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50242 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50243 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50244 SDValue Op11 = peekThroughBitcasts(Ops1.back());
// Canonicalize the commuted-sources case onto the matched form below.
50245 if ((Op00 == Op11) && (Op01 == Op10)) {
50246 std::swap(Op10, Op11);
// NOTE(review): line 50247 is missing from this listing -- presumably
// "ShuffleVectorSDNode::commuteMask(ScaledMask1);" or a swap of the
// ScaledMask1 entries to match the operand swap; verify upstream.
50248 }
50249 if ((Op00 == Op10) && (Op01 == Op11)) {
50250 const int Map[4] = {0, 2, 1, 3};
50251 SmallVector<int, 4> ShuffleMask(
50252 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50253 Map[ScaledMask1[1]]});
50254 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50255 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50256 DAG.getBitcast(SrcVT, Op01));
50257 Res = DAG.getBitcast(ShufVT, Res);
50258 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50259 return DAG.getBitcast(VT, Res);
50260 }
50261 }
50262 }
50263
50264 return SDValue();
50265}
50266
// Combine for X86ISD::PACKSS/PACKUS nodes: constant folding, shuffle
// reassociation, NOT hoisting, truncate widening and extend/concat folds.
// NOTE(review): the opening signature lines (50267-50268) were dropped by
// the extraction; presumably "static SDValue combineVectorPack(SDNode *N,
// SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI," -- confirm
// against the original source.
50269 const X86Subtarget &Subtarget) {
50270 unsigned Opcode = N->getOpcode();
50271 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50272 "Unexpected pack opcode");
50273
50274 EVT VT = N->getValueType(0);
50275 SDValue N0 = N->getOperand(0);
50276 SDValue N1 = N->getOperand(1);
50277 unsigned NumDstElts = VT.getVectorNumElements();
50278 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
// PACK halves the element width: source elements are twice as wide.
50279 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50280 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50281 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50282 "Unexpected PACKSS/PACKUS input type");
50283
50284 bool IsSigned = (X86ISD::PACKSS == Opcode);
50285
50286 // Constant Folding.
50287 APInt UndefElts0, UndefElts1;
50288 SmallVector<APInt, 32> EltBits0, EltBits1;
50289 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50290 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50291 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50292 /*AllowWholeUndefs*/ true,
50293 /*AllowPartialUndefs*/ true) &&
50294 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50295 /*AllowWholeUndefs*/ true,
50296 /*AllowPartialUndefs*/ true)) {
// PACK interleaves its operands per 128-bit lane, so fold lane by lane.
50297 unsigned NumLanes = VT.getSizeInBits() / 128;
50298 unsigned NumSrcElts = NumDstElts / 2;
50299 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50300 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50301
50302 APInt Undefs(NumDstElts, 0);
50303 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50304 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50305 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
// First half of each dst lane comes from N0, second half from N1.
50306 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50307 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50308 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50309
50310 if (UndefElts[SrcIdx]) {
50311 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50312 continue;
50313 }
50314
50315 APInt &Val = EltBits[SrcIdx];
50316 if (IsSigned) {
50317 // PACKSS: Truncate signed value with signed saturation.
50318 // Source values less than dst minint are saturated to minint.
50319 // Source values greater than dst maxint are saturated to maxint.
50320 Val = Val.truncSSat(DstBitsPerElt);
50321 } else {
50322 // PACKUS: Truncate signed value with unsigned saturation.
50323 // Source values less than zero are saturated to zero.
50324 // Source values greater than dst maxuint are saturated to maxuint.
50325 // NOTE: This is different from APInt::truncUSat.
50326 if (Val.isIntN(DstBitsPerElt))
50327 Val = Val.trunc(DstBitsPerElt)
50328 else if (Val.isNegative())
50329 Val = APInt::getZero(DstBitsPerElt);
50330 else
50331 Val = APInt::getAllOnes(DstBitsPerElt);
50332 }
50333 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50334 }
50335 }
50336
50337 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50338 }
50339
50340 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50341 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50342 return V;
50343
50344 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50345 // Currently limit this to allsignbits cases only.
50346 if (IsSigned &&
50347 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50348 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt) {
50349 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50350 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50351 if (Not0 && Not1) {
50352 SDLoc DL(N);
50353 MVT SrcVT = N0.getSimpleValueType();
50354 SDValue Pack =
50355 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50356 DAG.getBitcast(SrcVT, Not1));
50357 return DAG.getNOT(DL, Pack, VT);
50358 }
50359 }
50360
50361 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50362 // truncate to create a larger truncate.
50363 if (Subtarget.hasAVX512() &&
50364 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50365 N0.getOperand(0).getValueType() == MVT::v8i32) {
50366 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50367 (!IsSigned &&
50368 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50369 if (Subtarget.hasVLX())
50370 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50371
50372 // Widen input to v16i32 so we can truncate that.
50373 SDLoc dl(N);
50374 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50375 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50376 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50377 }
50378 }
50379
50380 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50381 if (VT.is128BitVector()) {
50382 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50383 SDValue Src0, Src1;
50384 if (N0.getOpcode() == ExtOpc &&
// NOTE(review): line 50385 is missing from this listing -- likely a
// 64-bit-width check on N0.getOperand(0); verify upstream.
50386 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50387 Src0 = N0.getOperand(0);
50388 }
50389 if (N1.getOpcode() == ExtOpc &&
// NOTE(review): line 50390 is missing here as well (mirror check on N1).
50391 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50392 Src1 = N1.getOperand(0);
50393 }
50394 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50395 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
// An undef side borrows the other side's type for its UNDEF node.
50396 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50397 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50398 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50399 }
50400
50401 // Try again with pack(*_extend_vector_inreg, undef).
50402 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
// NOTE(review): line 50403 is missing -- presumably
// ": ISD::ZERO_EXTEND_VECTOR_INREG;" completing this conditional.
50404 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50405 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50406 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50407 DAG);
50408 }
50409
50410 // Attempt to combine as shuffle.
50411 SDValue Op(N, 0);
50412 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50413 return Res;
50414
50415 return SDValue();
50416}
50417
// Combine for X86ISD::HADD/HSUB/FHADD/FHSUB nodes.
// NOTE(review): the opening signature lines (50418-50419) were dropped by
// the extraction; presumably "static SDValue combineVectorHADDSUB(
// SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI," --
// confirm against the original source.
50420 const X86Subtarget &Subtarget) {
50421 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50422 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50423 "Unexpected horizontal add/sub opcode");
50424
// On targets where horizontal ops are slow, rewrite chained HOPs so only
// one inner HOP is needed, fed to both sides through cheap PSHUFDs.
50425 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50426 MVT VT = N->getSimpleValueType(0);
50427 SDValue LHS = N->getOperand(0);
50428 SDValue RHS = N->getOperand(1);
50429
50430 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50431 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50432 LHS.getOpcode() == RHS.getOpcode() &&
50433 LHS.getValueType() == RHS.getValueType() &&
50434 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50435 SDValue LHS0 = LHS.getOperand(0);
50436 SDValue LHS1 = LHS.getOperand(1);
50437 SDValue RHS0 = RHS.getOperand(0);
50438 SDValue RHS1 = RHS.getOperand(1);
// Each inner HOP must be unary (both operands equal, or one undef).
50439 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50440 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50441 SDLoc DL(N);
50442 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50443 LHS0.isUndef() ? LHS1 : LHS0,
50444 RHS0.isUndef() ? RHS1 : RHS0);
50445 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50446 Res = DAG.getBitcast(ShufVT, Res);
// Low half of the merged HOP feeds the new LHS, high half the new RHS.
50447 SDValue NewLHS =
50448 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50449 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50450 SDValue NewRHS =
50451 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50452 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50453 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50454 DAG.getBitcast(VT, NewRHS));
50455 }
50456 }
50457 }
50458
50459 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50460 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50461 return V;
50462
50463 return SDValue();
50464}
50465
// Combine for X86ISD::VSHL/VSRA/VSRL nodes (shift amount in a vector
// operand, but applied uniformly from its bottom 64 bits).
// NOTE(review): the opening signature lines (50466-50467) were dropped by
// the extraction; presumably "static SDValue combineVectorShiftVar(
// SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI," --
// DCI is used below, so it must be a parameter; confirm upstream.
50468 const X86Subtarget &Subtarget) {
50469 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50470 X86ISD::VSRL == N->getOpcode()) &&
50471 "Unexpected shift opcode");
50472 EVT VT = N->getValueType(0);
50473 SDValue N0 = N->getOperand(0);
50474 SDValue N1 = N->getOperand(1);
50475
50476 // Shift zero -> zero.
// NOTE(review): the guard line (50477) is missing from this listing --
// presumably "if (ISD::isBuildVectorAllZeros(N0.getNode()))"; verify
// upstream.
50478 return DAG.getConstant(0, SDLoc(N), VT);
50479
50480 // Detect constant shift amounts.
// Uniform constant amount -> switch to the immediate-form (VSHLI etc.)
// opcode via the target helpers.
50481 APInt UndefElts;
50482 SmallVector<APInt, 32> EltBits;
50483 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50484 /*AllowWholeUndefs*/ true,
50485 /*AllowPartialUndefs*/ false)) {
50486 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50487 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50488 EltBits[0].getZExtValue(), DAG);
50489 }
50490
50491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50492 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50493 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50494 return SDValue(N, 0);
50495
50496 return SDValue();
50497}
50498
// Combine for X86ISD::VSHLI/VSRAI/VSRLI nodes (immediate i8 shift amount).
// NOTE(review): the opening signature lines (50499-50500) were dropped by
// the extraction; presumably "static SDValue combineVectorShiftImm(
// SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI," --
// DCI is used below, so it must be a parameter; confirm upstream.
50501 const X86Subtarget &Subtarget) {
50502 unsigned Opcode = N->getOpcode();
50503 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50504 X86ISD::VSRLI == Opcode) &&
50505 "Unexpected shift opcode");
50506 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50507 EVT VT = N->getValueType(0);
50508 SDValue N0 = N->getOperand(0);
50509 SDValue N1 = N->getOperand(1);
50510 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50511 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50512 "Unexpected value type");
50513 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50514
50515 // (shift undef, X) -> 0
50516 if (N0.isUndef())
50517 return DAG.getConstant(0, SDLoc(N), VT);
50518
50519 // Out of range logical bit shifts are guaranteed to be zero.
50520 // Out of range arithmetic bit shifts splat the sign bit.
50521 unsigned ShiftVal = N->getConstantOperandVal(1);
50522 if (ShiftVal >= NumBitsPerElt) {
50523 if (LogicalShift)
50524 return DAG.getConstant(0, SDLoc(N), VT);
// Arithmetic: clamp to elt-width minus one (splats the sign bit).
50525 ShiftVal = NumBitsPerElt - 1;
50526 }
50527
50528 // (shift X, 0) -> X
50529 if (!ShiftVal)
50530 return N0;
50531
50532 // (shift 0, C) -> 0
// NOTE(review): the guard line (50533) is missing from this listing --
// presumably "if (ISD::isBuildVectorAllZeros(N0.getNode()))"; verify
// upstream.
50534 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50535 // result are all zeros, not undef.
50536 return DAG.getConstant(0, SDLoc(N), VT);
50537
50538 // (VSRAI -1, C) -> -1
50539 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50540 // N0 is all ones or undef. We guarantee that the bits shifted into the
50541 // result are all ones, not undef.
50542 return DAG.getAllOnesConstant(SDLoc(N), VT);
50543
// Fold two same-direction shifts into one, saturating like the range
// handling above when the combined amount overflows the element.
50544 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50545 unsigned NewShiftVal = Amt0 + Amt1;
50546 if (NewShiftVal >= NumBitsPerElt) {
50547 // Out of range logical bit shifts are guaranteed to be zero.
50548 // Out of range arithmetic bit shifts splat the sign bit.
50549 if (LogicalShift)
50550 return DAG.getConstant(0, SDLoc(N), VT);
50551 NewShiftVal = NumBitsPerElt - 1;
50552 }
50553 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50554 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50555 };
50556
50557 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50558 if (Opcode == N0.getOpcode())
50559 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50560
50561 // (shl (add X, X), C) -> (shl X, (C + 1))
50562 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50563 N0.getOperand(0) == N0.getOperand(1))
50564 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50565
50566 // We can decode 'whole byte' logical bit shifts as shuffles.
50567 if (LogicalShift && (ShiftVal % 8) == 0) {
50568 SDValue Op(N, 0);
50569 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50570 return Res;
50571 }
50572
50573 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50574 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50575 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50576 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50577 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50578 N0.getOpcode() == X86ISD::PSHUFD &&
50579 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50580 N0->hasOneUse()) {
// NOTE(review): line 50581 is missing from this listing -- presumably
// "SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));" given BC is
// used immediately below; verify upstream.
50582 if (BC.getOpcode() == X86ISD::VSHLI &&
50583 BC.getScalarValueSizeInBits() == 64 &&
50584 BC.getConstantOperandVal(1) == 63) {
50585 SDLoc DL(N);
50586 SDValue Src = BC.getOperand(0);
50587 Src = DAG.getBitcast(VT, Src);
50588 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50589 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50590 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50591 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50592 return Src;
50593 }
50594 }
50595
// Fold the shift into a constant operand, per element, honoring the
// shift direction/signedness.
50596 auto TryConstantFold = [&](SDValue V) {
50597 APInt UndefElts;
50598 SmallVector<APInt, 32> EltBits;
50599 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50600 /*AllowWholeUndefs*/ true,
50601 /*AllowPartialUndefs*/ true))
50602 return SDValue();
50603 assert(EltBits.size() == VT.getVectorNumElements() &&
50604 "Unexpected shift value type");
50605 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50606 // created an undef input due to no input bits being demanded, but user
50607 // still expects 0 in other bits.
50608 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50609 APInt &Elt = EltBits[i];
50610 if (UndefElts[i])
50611 Elt = 0;
50612 else if (X86ISD::VSHLI == Opcode)
50613 Elt <<= ShiftVal;
50614 else if (X86ISD::VSRAI == Opcode)
50615 Elt.ashrInPlace(ShiftVal);
50616 else
50617 Elt.lshrInPlace(ShiftVal);
50618 }
50619 // Reset undef elements since they were zeroed above.
50620 UndefElts = 0;
50621 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50622 };
50623
50624 // Constant Folding.
50625 if (N->isOnlyUserOf(N0.getNode())) {
50626 if (SDValue C = TryConstantFold(N0))
50627 return C;
50628
50629 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50630 // Don't break NOT patterns.
// NOTE(review): line 50631 is missing from this listing -- presumably
// "SDValue BC = peekThroughOneUseBitcasts(N0);" given BC is used below.
50632 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50633 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
// NOTE(review): line 50634 is missing -- presumably the "not all-ones"
// check that protects NOT patterns, per the comment above.
50635 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50636 SDLoc DL(N);
50637 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50638 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50639 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50640 }
50641 }
50642 }
50643
50644 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50645 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50646 DCI))
50647 return SDValue(N, 0);
50648
50649 return SDValue();
50650}
50651
// Combine for X86ISD::PINSRB/PINSRW and ISD::INSERT_VECTOR_ELT nodes.
// NOTE(review): the opening signature lines (50652-50653) were dropped by
// the extraction; presumably "static SDValue combineVectorInsert(
// SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI," --
// DCI is used below, so it must be a parameter; confirm upstream.
50654 const X86Subtarget &Subtarget) {
50655 EVT VT = N->getValueType(0);
50656 unsigned Opcode = N->getOpcode();
// PINSRB is only valid on v16i8 and PINSRW on v8i16.
50657 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50658 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50659 Opcode == ISD::INSERT_VECTOR_ELT) &&
50660 "Unexpected vector insertion");
50661
50662 SDValue Vec = N->getOperand(0);
50663 SDValue Scl = N->getOperand(1);
50664 SDValue Idx = N->getOperand(2);
50665
50666 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50667 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50668 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50669
// For the target insert nodes, let demanded-bits analysis simplify the
// scalar operand / inserted element.
50670 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50671 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50672 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50673 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50674 APInt::getAllOnes(NumBitsPerElt), DCI))
50675 return SDValue(N, 0);
50676 }
50677
50678 // Attempt to combine insertion patterns to a shuffle.
50679 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50680 SDValue Op(N, 0);
50681 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50682 return Res;
50683 }
50684
50685 return SDValue();
50686}
50687
50688/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50689/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50690/// OR -> CMPNEQSS.
                                      const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0.getOperand(1);
    SDValue CMP1 = N1.getOperand(1);
    SDLoc DL(N);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT     VT = CMP00.getValueType();

    // Only scalar f32/f64 compares (and f16 with FP16) map onto the
    // CMPSS/CMPSD-style predicate instructions.
    if (VT == MVT::f32 || VT == MVT::f64 ||
        (VT == MVT::f16 && Subtarget.hasFP16())) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (const SDNode *U : N->users()) {
        if (ExpectingFlags)
          break;

        switch (U->getOpcode()) {
        // Unknown users conservatively count as wanting EFLAGS; note the
        // deliberate fallthrough from 'default' into the flag-using cases.
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }
      }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        // Canonicalize so that cc0 holds the E/NE condition.
        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        // Match the setcc pairs that FP ==/!= comparisons expand to (see the
        // function-level comment above).
        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          if (Subtarget.hasAVX512()) {
            SDValue FSetCC =
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                            DAG.getTargetConstant(x86cc, DL, MVT::i8));
            // Need to fill with zeros to ensure the bitcast will produce zeroes
            // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
            SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
                                      DAG.getConstant(0, DL, MVT::v16i1),
                                      FSetCC, DAG.getVectorIdxConstant(0, DL));
            return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
                                      N->getSimpleValueType(0));
          }
          SDValue OnesOrZeroesF =
              DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
                          CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));

          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

          if (is64BitFP && !Subtarget.is64Bit()) {
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
            // bits, but can do this little dance to extract the lowest 32 bits
            // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
                                           MVT::v2f64, OnesOrZeroesF);
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
            OnesOrZeroesF =
                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
                            DAG.getVectorIdxConstant(0, DL));
            IntVT = MVT::i32;
          }

          // The FSETCC result is all-ones or all-zeroes; mask down to a
          // single 0/1 bit and truncate to the i8 SETCC result width.
          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                      DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}
50797
50798/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
                                       SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");

  // ANDNP only exists for 128/256/512-bit vector types.
  MVT VT = N->getSimpleValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  SDValue X, Y;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // AND is commutative: look for a NOT on either side. X is the value being
  // inverted, Y the other operand.
  if (SDValue Not = IsNOT(N0, DAG)) {
    X = Not;
    Y = N1;
  } else if (SDValue Not = IsNOT(N1, DAG)) {
    X = Not;
    Y = N0;
  } else
    return SDValue();

  // The matched values may no longer be of type VT; cast back before
  // building the ANDNP.
  X = DAG.getBitcast(VT, X);
  Y = DAG.getBitcast(VT, Y);
  return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
}
50824
50825/// Try to fold:
50826/// and (vector_shuffle<Z,...,Z>
50827/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50828/// ->
50829/// andnp (vector_shuffle<Z,...,Z>
50830/// (insert_vector_elt undef, X, Z), undef), Y
                                      const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");

  EVT VT = N->getValueType(0);
  // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
  // value and require extra moves.
  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
        ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
    return SDValue();

  // Match a splat-shuffle of an insert_vector_elt whose inserted scalar is a
  // NOT, returning the equivalent shuffle with the NOT stripped.
  auto GetNot = [&DAG](SDValue V) {
    auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
    // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
    // end-users are ISD::AND including cases
    // (and(extract_vector_element(SVN), Y)).
    if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
        !SVN->getOperand(1).isUndef()) {
      return SDValue();
    }
    SDValue IVEN = SVN->getOperand(0);
    if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
        !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
      return SDValue();
    // The insert index must be the lane being splat.
    if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
        IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
      return SDValue();
    SDValue Src = IVEN.getOperand(1);
    if (SDValue Not = IsNOT(Src, DAG)) {
      SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
      // NOTE(review): the head of the DAG.getNode(ISD::INSERT_VECTOR_ELT,...)
      // call appears to be missing from this excerpt; verify against upstream.
      SDValue NotIVEN =
              IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
      return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
                                  SVN->getOperand(1), SVN->getMask());
    }
    return SDValue();
  };

  SDValue X, Y;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // AND is commutative: try to strip a splatted NOT from either side.
  if (SDValue Not = GetNot(N0)) {
    X = Not;
    Y = N1;
  } else if (SDValue Not = GetNot(N1)) {
    X = Not;
    Y = N0;
  } else
    return SDValue();

  X = DAG.getBitcast(VT, X);
  Y = DAG.getBitcast(VT, Y);
  SDLoc DL(N);

  // We do not split for SSE at all, but we need to split vectors for AVX1 and
  // AVX2.
  // NOTE(review): the second line of this condition appears to be missing
  // from this excerpt; verify against upstream.
  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
    SDValue LoX, HiX;
    std::tie(LoX, HiX) = splitVector(X, DAG, DL);
    SDValue LoY, HiY;
    std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
    EVT SplitVT = LoX.getValueType();
    SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
    SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
  }

  if (TLI.isTypeLegal(VT))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});

  return SDValue();
}
50907
50908// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50909// logical operations, like in the example below.
50910// or (and (truncate x, truncate y)),
50911// (xor (truncate z, build_vector (constants)))
50912// Given a target type \p VT, we generate
50913// or (and x, y), (xor z, zext(build_vector (constants)))
50914// given x, y and z are of type \p VT. We can do so, if operands are either
50915// truncates from VT types, the second operand is a vector of constants or can
50916// be recursively promoted.
                                        SelectionDAG &DAG, unsigned Depth) {
  // Limit recursion to avoid excessive compile times.
  // NOTE(review): the depth-check condition line appears to be missing from
  // this excerpt; verify against upstream.
    return SDValue();

  // Only AND/OR/XOR can be widened this way.
  if (!ISD::isBitwiseLogicOp(N.getOpcode()))
    return SDValue();

  SDValue N0 = N.getOperand(0);
  SDValue N1 = N.getOperand(1);

  // The widened logic op must itself be supported at type VT.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
    return SDValue();

  // Widen the LHS: either promote it recursively, or strip a trunc-from-VT.
  if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
    N0 = NN0;
  else {
    // The left side has to be a trunc.
    if (N0.getOpcode() != ISD::TRUNCATE)
      return SDValue();

    // The type of the truncated inputs.
    if (N0.getOperand(0).getValueType() != VT)
      return SDValue();

    N0 = N0.getOperand(0);
  }

  // Widen the RHS the same way, additionally allowing a foldable constant.
  if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
    N1 = NN1;
  else {
    // The right side has to be a 'trunc' or a (foldable) constant.
    bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                    N1.getOperand(0).getValueType() == VT;
    if (RHSTrunc)
      N1 = N1.getOperand(0);
    // NOTE(review): the constant-folding call that initializes Cst appears to
    // be missing from this excerpt; verify against upstream.
    else if (SDValue Cst =
      N1 = Cst;
    else
      return SDValue();
  }

  // Rebuild the logic op directly at the wide type.
  return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
}
50964
50965// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
50966// register. In most cases we actually compare or select YMM-sized registers
50967// and mixing the two types creates horrible code. This method optimizes
50968// some of the transition sequences.
50969// Even with AVX-512 this is still useful for removing casts around logical
50970// operations on vXi1 mask types.
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT VT = N.getValueType();
  assert(VT.isVector() && "Expected vector type");
  // Only extension nodes wrap the narrow logic we want to widen.
  assert((N.getOpcode() == ISD::ANY_EXTEND ||
          N.getOpcode() == ISD::ZERO_EXTEND ||
          N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N.getOperand(0);
  EVT NarrowVT = Narrow.getValueType();

  // Generate the wide operation.
  SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
  if (!Op)
    return SDValue();
  // Re-apply the original extension semantics to the widened value.
  switch (N.getOpcode()) {
  default: llvm_unreachable("Unexpected opcode");
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND:
    return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  }
}
50998
50999static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51000 unsigned FPOpcode;
51001 switch (Opcode) {
51002 // clang-format off
51003 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51004 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51005 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51006 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51007 // clang-format on
51008 }
51009 return FPOpcode;
51010}
51011
51012/// If both input operands of a logic op are being cast from floating-point
51013/// types or FP compares, try to convert this into a floating-point logic node
51014/// to avoid unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
                                        SDValue N0, SDValue N1,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
         "Unexpected bit opcode");

  // Only handle logic between two FP bitcasts or between two setccs.
  if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
        (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);
  EVT N00Type = N00.getValueType();
  EVT N10Type = N10.getValueType();

  // Ensure that both types are the same and are legal scalar fp types.
  if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
                              (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
                              (Subtarget.hasFP16() && N00Type == MVT::f16)))
    return SDValue();

  // Bitcast case: logic(bitcast(X), bitcast(Y)) --> bitcast(fplogic(X, Y)).
  if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
    unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
    SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
    return DAG.getBitcast(VT, FPLogic);
  }

  // Setcc case: only i1 results with single-use setcc operands are handled.
  if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
      !N1.hasOneUse())
    return SDValue();

  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();

  // The vector ISA for FP predicates is incomplete before AVX, so converting
  // COMIS* to CMPS* may not be a win before AVX.
  if (!Subtarget.hasAVX() &&
      !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
    return SDValue();

  // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
  // and vector logic:
  // logic (setcc N00, N01), (setcc N10, N11) -->
  // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
  unsigned NumElts = 128 / N00Type.getSizeInBits();
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
  EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
  SDValue N01 = N0.getOperand(1);
  SDValue N11 = N1.getOperand(1);
  SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
  SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
  SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
  SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
  SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
  SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
  SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
}
51076
51077// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51078// to reduce XMM->GPR traffic.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
                                      SDValue N1, SelectionDAG &DAG) {
  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
         "Unexpected bit opcode");

  // Both operands must be single use MOVMSK.
  if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
      N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
    return SDValue();

  SDValue Vec0 = N0.getOperand(0);
  SDValue Vec1 = N1.getOperand(0);
  EVT VecVT0 = Vec0.getValueType();
  EVT VecVT1 = Vec1.getValueType();

  // Both MOVMSK operands must be from vectors of the same size and same element
  // size, but its OK for a fp/int diff.
  if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
      VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
    return SDValue();

  // NOTE(review): the initializer of VecOpc appears to be truncated in this
  // excerpt; verify against upstream.
  unsigned VecOpc =
  SDValue Result =
      DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
  // One MOVMSK of the combined vector replaces two MOVMSKs plus a GPR bitop.
  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
}
51106
51107// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51108// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51109// handles in InstCombine.
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
                                     SDValue N0, SDValue N1,
                                     SelectionDAG &DAG) {
  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
         "Unexpected bit opcode");

  // Both operands must be single use.
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  // Search for matching shifts.
  // NOTE(review): the declarations of BC0/BC1 (the peeked-through operands)
  // appear to be missing from this excerpt; verify against upstream.

  unsigned BCOpc = BC0.getOpcode();
  EVT BCVT = BC0.getValueType();
  if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
    return SDValue();

  switch (BCOpc) {
  case X86ISD::VSHLI:
  case X86ISD::VSRLI:
  case X86ISD::VSRAI: {
    // Only fold when both sides shift by the same immediate amount.
    if (BC0.getOperand(1) != BC1.getOperand(1))
      return SDValue();
    // Apply the bitop to the unshifted sources, then shift once.
    SDValue BitOp =
        DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
    SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
    return DAG.getBitcast(VT, Shift);
  }
  }

  return SDValue();
}
51144
51145// Attempt to fold:
51146// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51147// TODO: Handle PACKUS handling.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
                                    SDValue N0, SDValue N1, SelectionDAG &DAG) {
  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
         "Unexpected bit opcode");

  // Both operands must be single use.
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  // Search for matching packs.
  // NOTE(review): the peek-through-bitcast reassignments of N0/N1 appear to
  // be missing from this excerpt; verify against upstream.

  if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
    return SDValue();

  MVT DstVT = N0.getSimpleValueType();
  if (DstVT != N1.getSimpleValueType())
    return SDValue();

  MVT SrcVT = N0.getOperand(0).getSimpleValueType();
  unsigned NumSrcBits = SrcVT.getScalarSizeInBits();

  // Limit to allsignbits packing.
  if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
      DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
      DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
      DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
    return SDValue();

  // All-sign-bit values survive the sign-saturating pack unchanged, so the
  // bitop can be applied to the wide sources and packed once.
  SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
  SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
  return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
}
51182
51183/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51184/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51185/// with a shift-right to eliminate loading the vector constant mask value.
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
  EVT VT = Op0.getValueType();
  if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
    return SDValue();

  // Try to convert an "is positive" signbit masking operation into arithmetic
  // shift and "andn". This saves a materialization of a -1 vector constant.
  // The "is negative" variant should be handled more generally because it only
  // requires "and" rather than "andn":
  // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
  //
  // This is limited to the original type to avoid producing even more bitcasts.
  // If the bitcasts can't be eliminated, then it is unlikely that this fold
  // will be profitable.
  if (N->getValueType(0) == VT &&
      supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
    SDValue X, Y;
    if (Op1.getOpcode() == X86ISD::PCMPGT &&
        isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
      X = Op1.getOperand(0);
      Y = Op0;
    } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
               isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
      X = Op0.getOperand(0);
      Y = Op1;
    }
    if (X && Y) {
      // NOTE(review): the head of the shift-construction call appears to be
      // missing from this excerpt; verify against upstream.
      SDValue Sra =
                      VT.getScalarSizeInBits() - 1, DAG);
      return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
    }
  }

  // Otherwise, require the RHS to be a splatted low-bits mask (Mask == 1 for
  // the SETCC+ZEXT lowering mentioned in the function comment).
  APInt SplatVal;
  if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
    return SDValue();

  // Don't prevent creation of ANDN.
  if (isBitwiseNot(Op0))
    return SDValue();

  if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
    return SDValue();

  // Only valid when every element of Op0 is known all-zeros or all-ones; then
  // a logical shift right reproduces the effect of the low-bits mask.
  unsigned EltBitWidth = VT.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  unsigned ShiftVal = SplatVal.countr_one();
  SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
  return DAG.getBitcast(N->getValueType(0), Shift);
}
51244
51245// Get the index node from the lowered DAG of a GEP IR instruction with one
51246// indexing dimension.
  // Pre/post-indexed loads fold the address arithmetic; give up on those.
  if (Ld->isIndexed())
    return SDValue();

  // Expect the address to look like (add (shl Index, Scale), ...).
  SDValue Base = Ld->getBasePtr();
  if (Base.getOpcode() != ISD::ADD)
    return SDValue();

  SDValue ShiftedIndex = Base.getOperand(0);
  if (ShiftedIndex.getOpcode() != ISD::SHL)
    return SDValue();

  // Return the unscaled index feeding the shl.
  return ShiftedIndex.getOperand(0);
}
51261
51262static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51263 return Subtarget.hasBMI2() &&
51264 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51265}
51266
51267/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51268/// This undoes the inverse fold performed in InstCombine
                                             SelectionDAG &DAG) {
  using namespace llvm::SDPatternMatch;
  MVT VT = N->getSimpleValueType(0);
  // Only profitable when the target has an and-not instruction for this node.
  if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
    return SDValue();

  SDValue X, Y, Z;
  if (sd_match(N, m_And(m_Value(X),
                        m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
    // Don't fold if Y or Z are constants to prevent infinite loops.
    // NOTE(review): the constant checks guarding this transform appear to be
    // missing from this excerpt; verify against upstream.
      return DAG.getNode(
          ISD::AND, DL, VT, X,
          DAG.getNOT(
              DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
  }

  return SDValue();
}
51290
51291// This function recognizes cases where X86 bzhi instruction can replace and
51292// 'and-load' sequence.
51293// In case of loading integer value from an array of constants which is defined
51294// as follows:
51295//
51296// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51297//
51298// then applying a bitwise and on the result with another input.
51299// It's equivalent to performing bzhi (zero high bits) on the input, with the
51300// same index of the load.
                                     const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  // Check if subtarget has BZHI instruction for the node's type
  if (!hasBZHI(Subtarget, VT))
    return SDValue();

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    // continue if the operand is not a load instruction
    auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
    if (!Ld)
      continue;
    const Value *MemOp = Ld->getMemOperand()->getValue();
    if (!MemOp)
      continue;
    // Get the Node which indexes into the array.
    // NOTE(review): the call that initializes Index appears to be missing
    // from this excerpt; verify against upstream.
    if (!Index)
      continue;

    // The load must come from a constant global array indexed through a
    // single-dimension GEP.
    if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
      if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
          Constant *Init = GV->getInitializer();
          Type *Ty = Init->getType();
          // NOTE(review): parts of this element-type/size check appear to be
          // missing from this excerpt; verify against upstream.
          if (!isa<ConstantDataArray>(Init) ||
              !Ty->getArrayElementType()->isIntegerTy() ||
                  VT.getSizeInBits() ||
              Ty->getArrayNumElements() >
            continue;

          // Check if the array's constant elements are suitable to our case.
          // Element j must equal 2^j - 1 (0x0, 0x1, 0x3, 0x7, ...).
          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
          bool ConstantsMatch = true;
          for (uint64_t j = 0; j < ArrayElementCount; j++) {
            auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
              ConstantsMatch = false;
              break;
            }
          }
          if (!ConstantsMatch)
            continue;

          // Do the transformation (For 32-bit type):
          // -> (and (load arr[idx]), inp)
          // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
          // that will be replaced with one bzhi instruction.
          SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);

          // Shift amount = element-size - idx, as an i8 shift operand.
          Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
          SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
          Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);

          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
        }
      }
    }
  }
  return SDValue();
}
51370
51371// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51372// Where C is a mask containing the same number of bits as the setcc and
51373// where the setcc will freely 0 upper bits of k-register. We can replace the
51374// undef in the concat with 0s and remove the AND. This mainly helps with
51375// v2i1/v4i1 setcc being casted to scalar.
                                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");

  EVT VT = N->getValueType(0);

  // Make sure this is an AND with constant. We will check the value of the
  // constant later.
  auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C1)
    return SDValue();

  // This is implied by the ConstantSDNode.
  assert(!VT.isVector() && "Expected scalar VT!");

  SDValue Src = N->getOperand(0);
  if (!Src.hasOneUse())
    return SDValue();

  // (Optionally) peek through any_extend().
  if (Src.getOpcode() == ISD::ANY_EXTEND) {
    if (!Src.getOperand(0).hasOneUse())
      return SDValue();
    Src = Src.getOperand(0);
  }

  // The scalar must come from a bitcast of a vXi1 mask vector.
  if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
    return SDValue();

  Src = Src.getOperand(0);
  EVT SrcVT = Src.getValueType();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
      !TLI.isTypeLegal(SrcVT))
    return SDValue();

  if (Src.getOpcode() != ISD::CONCAT_VECTORS)
    return SDValue();

  // We only care about the first subvector of the concat, we expect the
  // other subvectors to be ignored due to the AND if we make the change.
  SDValue SubVec = Src.getOperand(0);
  EVT SubVecVT = SubVec.getValueType();

  // The RHS of the AND should be a mask with as many bits as SubVec.
  if (!TLI.isTypeLegal(SubVecVT) ||
      !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
    return SDValue();

  // First subvector should be a setcc with a legal result type or a
  // AND containing at least one setcc with a legal result type.
  auto IsLegalSetCC = [&](SDValue V) {
    if (V.getOpcode() != ISD::SETCC)
      return false;
    // A "legal" setcc here means one whose compare type fits the available
    // AVX-512 compare instructions (VLX for sub-512-bit, BWI for i8/i16).
    EVT SetccVT = V.getOperand(0).getValueType();
    if (!TLI.isTypeLegal(SetccVT) ||
        !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
      return false;
    if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
      return false;
    return true;
  };
  if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
                                 (IsLegalSetCC(SubVec.getOperand(0)) ||
                                  IsLegalSetCC(SubVec.getOperand(1))))))
    return SDValue();

  // We passed all the checks. Rebuild the concat_vectors with zeroes
  // and cast it back to VT.
  SDLoc dl(N);
  SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
                              DAG.getConstant(0, dl, SubVecVT));
  Ops[0] = SubVec;
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
                               Ops);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
  return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
}
51455
                                 SDValue OpMustEq, SDValue Op, unsigned Depth) {
  // We don't want to go crazy with the recursion here. This isn't a super
  // important optimization.
  static constexpr unsigned kMaxDepth = 2;

  // Only do this re-ordering if op has one use.
  if (!Op.hasOneUse())
    return SDValue();

  SDLoc DL(Op);
  // If we hit another associative op, recurse further.
  if (Op.getOpcode() == Opc) {
    // Done recursing.
    if (Depth++ >= kMaxDepth)
      return SDValue();

    // Try to pull a BMI pattern out of either operand of the inner op,
    // re-associating the remaining operand on top of the match.
    for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
      if (SDValue R =
              getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
        return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
                           Op.getOperand(1 - OpIdx));

  } else if (Op.getOpcode() == ISD::SUB) {
    if (Opc == ISD::AND) {
      // BLSI: (and x, (sub 0, x))
      if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
        return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
    }
    // Opc must be ISD::AND or ISD::XOR
    // BLSR: (and x, (sub x, 1))
    // BLSMSK: (xor x, (sub x, 1))
    if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
      return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);

  } else if (Op.getOpcode() == ISD::ADD) {
    // Opc must be ISD::AND or ISD::XOR
    // BLSR: (and x, (add x, -1))
    // BLSMSK: (xor x, (add x, -1))
    if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
      return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
  }
  return SDValue();
}
51500
                                  const X86Subtarget &Subtarget) {
  // Try to form BLSI/BLSR/BLSMSK patterns (see getBMIMatchingOp) from
  // AND/XOR trees.
  EVT VT = N->getValueType(0);
  // Make sure this node is a candidate for BMI instructions.
  if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
      (VT != MVT::i32 && VT != MVT::i64))
    return SDValue();

  assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);

  // Try and match LHS and RHS.
  for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
    if (SDValue OpMatch =
            getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
                             N->getOperand(1 - OpIdx), 0))
      return OpMatch;
  return SDValue();
}
51519
51520/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  using namespace llvm::SDPatternMatch;

  EVT VT = And->getValueType(0);
  // Make sure this node is a candidate for BMI instructions.
  if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
    return SDValue();

  SDValue X;
  SDValue Y;
  // NOTE(review): part of this pattern-match expression appears to be
  // missing from this excerpt; verify against upstream.
  if (!sd_match(And, m_And(m_OneUse(m_Xor(m_Value(X),
                           m_Value(Y))))
    return SDValue();

  // Build X ^ (X - 1) (the BLSMSK idiom) and AND Y with its inverse, which
  // selects to ANDN.
  SDValue BLSMSK =
      DAG.getNode(ISD::XOR, DL, VT, X,
                  DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
  SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
  return AndN;
}
51544
                                        SelectionDAG &DAG,
                                        const X86Subtarget &ST) {
  // cmp(setcc(cc, X), 0)
  // brcond ne
  // ->
  // X
  // brcond cc

  // sub(setcc(cc, X), 1)
  // brcond ne
  // ->
  // X
  // brcond ~cc
  //
  // if only flag has users

  SDValue SetCC = N->getOperand(0);

  // Operand 0 must be an X86 SETCC and our flag result must have one user.
  if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
    return SDValue();

  // Check the only user of flag is `brcond ne`.
  SDNode *BrCond = *Flag->user_begin();
  if (BrCond->getOpcode() != X86ISD::BRCOND)
    return SDValue();
  unsigned CondNo = 2;
  // NOTE(review): the X86::COND_NE comparison operand appears to be missing
  // from this excerpt; verify against upstream.
  if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
    return SDValue();

  SDValue X = SetCC.getOperand(1);
  // sub has two results while X only have one. DAG combine assumes the value
  // type matches.
  if (N->getOpcode() == X86ISD::SUB)
    X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));

  SDValue CCN = SetCC.getOperand(0);
  X86::CondCode CC =
      static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
  // NOTE(review): the definition of OppositeCC appears to be missing from
  // this excerpt; verify against upstream.
  // Update CC for the consumer of the flag.
  // The old CC is `ne`. Hence, when comparing the result with 0, we are
  // checking if the second condition evaluates to true. When comparing the
  // result with 1, we are checking if the second condition evaluates to false.
  SmallVector<SDValue> Ops(BrCond->op_values());
  if (isNullConstant(N->getOperand(1)))
    Ops[CondNo] = CCN;
  else if (isOneConstant(N->getOperand(1)))
    Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
  else
    llvm_unreachable("expect constant 0 or 1");

  SDValue NewBrCond =
      DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
  // Avoid self-assign error b/c CC1 can be `e/ne`.
  if (BrCond != NewBrCond.getNode())
    DCI.CombineTo(BrCond, NewBrCond);
  return X;
}
51606
51609 const X86Subtarget &ST) {
51610 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51611 // ->
51612 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51613
51614 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51615 // ->
51616 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51617 //
51618 // where cflags is determined by cc1.
51619
51620 if (!ST.hasCCMP())
51621 return SDValue();
51622
51623 SDValue SetCC0 = N->getOperand(0);
51624 SDValue SetCC1 = N->getOperand(1);
51625 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51626 SetCC1.getOpcode() != X86ISD::SETCC)
51627 return SDValue();
51628
51629 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51630 SDValue Op = V.getOperand(1);
51631 unsigned Opc = Op.getOpcode();
51632 if (Opc == X86ISD::SUB)
51633 return X86ISD::CCMP;
51634 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51635 return X86ISD::CTEST;
51636 return 0U;
51637 };
51638
51639 unsigned NewOpc = 0;
51640
51641 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51642 // appear on the right.
51643 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51644 std::swap(SetCC0, SetCC1);
51645 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51646 return SDValue();
51647 }
51648
51649 X86::CondCode CC0 =
51650 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51651 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51652 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51653 return SDValue();
51654
51655 bool IsOR = N->getOpcode() == ISD::OR;
51656
51657 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51658 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51659 // operator is OR. Similar for CC1.
51660 SDValue SrcCC =
51662 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51663 : SetCC0.getOperand(0);
51664 SDValue CC1N = SetCC1.getOperand(0);
51665 X86::CondCode CC1 =
51666 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51668 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51669 SDLoc DL(N);
51670 SDValue CFlags = DAG.getTargetConstant(
51671 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51672 SDValue Sub = SetCC1.getOperand(1);
51673
51674 // Replace any uses of the old flag produced by SUB/CMP with the new one
51675 // produced by CCMP/CTEST.
51676 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51677 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51678 {Sub.getOperand(0), Sub.getOperand(1),
51679 CFlags, SrcCC, SetCC0.getOperand(1)})
51680 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51681 {Sub.getOperand(0), Sub.getOperand(0),
51682 CFlags, SrcCC, SetCC0.getOperand(1)});
51683
51684 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51685}
51686
51689 const X86Subtarget &Subtarget) {
51690 using namespace SDPatternMatch;
51691
51692 SDValue N0 = N->getOperand(0);
51693 SDValue N1 = N->getOperand(1);
51694 EVT VT = N->getValueType(0);
51695 SDLoc dl(N);
51696 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51697
51698 // If this is SSE1 only convert to FAND to avoid scalarization.
51699 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51700 return DAG.getBitcast(MVT::v4i32,
51701 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51702 DAG.getBitcast(MVT::v4f32, N0),
51703 DAG.getBitcast(MVT::v4f32, N1)));
51704 }
51705
51706 // Use a 32-bit and+zext if upper bits known zero.
51707 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51708 APInt HiMask = APInt::getHighBitsSet(64, 32);
51709 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51710 DAG.MaskedValueIsZero(N0, HiMask)) {
51711 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51712 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51713 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51714 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51715 }
51716 }
51717
51718 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51719 // TODO: Support multiple SrcOps.
51720 if (VT == MVT::i1) {
51722 SmallVector<APInt, 2> SrcPartials;
51723 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51724 SrcOps.size() == 1) {
51725 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51726 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51727 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51728 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51729 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51730 if (Mask) {
51731 assert(SrcPartials[0].getBitWidth() == NumElts &&
51732 "Unexpected partial reduction mask");
51733 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51734 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51735 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51736 }
51737 }
51738 }
51739
51740 // InstCombine converts:
51741 // `(-x << C0) & C1`
51742 // to
51743 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51744 // This saves an IR instruction but on x86 the neg/shift version is preferable
51745 // so undo the transform.
51746
51747 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51748 // TODO: We don't actually need a splat for this, we just need the checks to
51749 // hold for each element.
51750 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51751 /*AllowTruncation*/ false);
51752 ConstantSDNode *N01C =
51753 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51754 /*AllowTruncation*/ false);
51755 if (N1C && N01C) {
51756 const APInt &MulC = N01C->getAPIntValue();
51757 const APInt &AndC = N1C->getAPIntValue();
51758 APInt MulCLowBit = MulC & (-MulC);
51759 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51760 (MulCLowBit + MulC).isPowerOf2()) {
51761 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51762 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51763 assert(MulCLowBitLog != -1 &&
51764 "Isolated lowbit is somehow not a power of 2!");
51765 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51766 DAG.getConstant(MulCLowBitLog, dl, VT));
51767 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51768 }
51769 }
51770 }
51771
51772 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51773 return SetCC;
51774
51775 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51776 return V;
51777
51778 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51779 return R;
51780
51781 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51782 return R;
51783
51784 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51785 return R;
51786
51787 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51788 DAG, DCI, Subtarget))
51789 return FPLogic;
51790
51791 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51792 return R;
51793
51794 if (DCI.isBeforeLegalizeOps())
51795 return SDValue();
51796
51797 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51798 return R;
51799
51800 if (SDValue R = combineAndNotIntoANDNP(N, dl ,DAG))
51801 return R;
51802
51803 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51804 return ShiftRight;
51805
51806 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51807 return R;
51808
51809 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51810 return R;
51811
51812 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51813 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51814 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51815 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51816 unsigned Opc0 = N0.getOpcode();
51817 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51819 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51820 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51821 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51822 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51823 }
51824 }
51825
51826 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51827 // to make use of predicated selects.
51828 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51829 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51830 SDValue X, Y;
51831 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51832 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51833 sd_match(N, m_And(m_Value(X),
51834 m_OneUse(m_SExt(m_AllOf(
51835 m_Value(Y), m_SpecificVT(CondVT),
51836 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51837 return DAG.getSelect(dl, VT, Y, X,
51838 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51839 }
51840 }
51841
51842 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51843 // avoids slow variable shift (moving shift amount to ECX etc.)
51844 if (isOneConstant(N1) && N0->hasOneUse()) {
51845 SDValue Src = N0;
51846 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51847 Src.getOpcode() == ISD::TRUNCATE) &&
51848 Src.getOperand(0)->hasOneUse())
51849 Src = Src.getOperand(0);
51850 bool ContainsNOT = false;
51851 X86::CondCode X86CC = X86::COND_B;
51852 // Peek through AND(NOT(SRL(X,Y)),1).
51853 if (isBitwiseNot(Src)) {
51854 Src = Src.getOperand(0);
51855 X86CC = X86::COND_AE;
51856 ContainsNOT = true;
51857 }
51858 if (Src.getOpcode() == ISD::SRL &&
51859 !isa<ConstantSDNode>(Src.getOperand(1))) {
51860 SDValue BitNo = Src.getOperand(1);
51861 Src = Src.getOperand(0);
51862 // Peek through AND(SRL(NOT(X),Y),1).
51863 if (isBitwiseNot(Src)) {
51864 Src = Src.getOperand(0);
51865 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51866 ContainsNOT = true;
51867 }
51868 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51869 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51870 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51871 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51872 }
51873 }
51874
51875 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51876 // Attempt to recursively combine a bitmask AND with shuffles.
51877 SDValue Op(N, 0);
51878 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51879 return Res;
51880
51881 // If either operand is a constant mask, then only the elements that aren't
51882 // zero are actually demanded by the other operand.
51883 auto GetDemandedMasks = [&](SDValue Op) {
51884 APInt UndefElts;
51885 SmallVector<APInt> EltBits;
51886 int NumElts = VT.getVectorNumElements();
51887 int EltSizeInBits = VT.getScalarSizeInBits();
51888 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51889 APInt DemandedElts = APInt::getAllOnes(NumElts);
51890 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51891 EltBits)) {
51892 DemandedBits.clearAllBits();
51893 DemandedElts.clearAllBits();
51894 for (int I = 0; I != NumElts; ++I) {
51895 if (UndefElts[I]) {
51896 // We can't assume an undef src element gives an undef dst - the
51897 // other src might be zero.
51898 DemandedBits.setAllBits();
51899 DemandedElts.setBit(I);
51900 } else if (!EltBits[I].isZero()) {
51901 DemandedBits |= EltBits[I];
51902 DemandedElts.setBit(I);
51903 }
51904 }
51905 }
51906 return std::make_pair(DemandedBits, DemandedElts);
51907 };
51908 APInt Bits0, Elts0;
51909 APInt Bits1, Elts1;
51910 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51911 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51912
51913 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51914 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51915 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51916 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51917 if (N->getOpcode() != ISD::DELETED_NODE)
51918 DCI.AddToWorklist(N);
51919 return SDValue(N, 0);
51920 }
51921
51922 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
51923 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
51924 if (NewN0 || NewN1)
51925 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
51926 NewN1 ? NewN1 : N1);
51927 }
51928
51929 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
51930 if ((VT.getScalarSizeInBits() % 8) == 0 &&
51932 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51933 SDValue BitMask = N1;
51934 SDValue SrcVec = N0.getOperand(0);
51935 EVT SrcVecVT = SrcVec.getValueType();
51936
51937 // Check that the constant bitmask masks whole bytes.
51938 APInt UndefElts;
51939 SmallVector<APInt, 64> EltBits;
51940 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51941 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
51942 llvm::all_of(EltBits, [](const APInt &M) {
51943 return M.isZero() || M.isAllOnes();
51944 })) {
51945 unsigned NumElts = SrcVecVT.getVectorNumElements();
51946 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
51947 unsigned Idx = N0.getConstantOperandVal(1);
51948
51949 // Create a root shuffle mask from the byte mask and the extracted index.
51950 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
51951 for (unsigned i = 0; i != Scale; ++i) {
51952 if (UndefElts[i])
51953 continue;
51954 int VecIdx = Scale * Idx + i;
51955 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
51956 }
51957
51959 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
51960 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
51961 /*AllowVariableCrossLaneMask=*/true,
51962 /*AllowVariablePerLaneMask=*/true,
51963 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
51964 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
51965 N0.getOperand(1));
51966 }
51967 }
51968
51969 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
51970 return R;
51971
51972 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
51973 return R;
51974
51975 return SDValue();
51976}
51977
51978// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
51980 SelectionDAG &DAG,
51981 const X86Subtarget &Subtarget) {
51982 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51983
51984 MVT VT = N->getSimpleValueType(0);
51985 unsigned EltSizeInBits = VT.getScalarSizeInBits();
51986 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
51987 return SDValue();
51988
51989 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
51990 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
51991 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
51992 return SDValue();
51993
51994 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
51995 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
51996 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
51997 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
51998 return SDValue();
51999
52000 // Attempt to extract constant byte masks.
52001 APInt UndefElts0, UndefElts1;
52002 SmallVector<APInt, 32> EltBits0, EltBits1;
52003 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52004 /*AllowWholeUndefs*/ false,
52005 /*AllowPartialUndefs*/ false))
52006 return SDValue();
52007 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52008 /*AllowWholeUndefs*/ false,
52009 /*AllowPartialUndefs*/ false))
52010 return SDValue();
52011
52012 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52013 // TODO - add UNDEF elts support.
52014 if (UndefElts0[i] || UndefElts1[i])
52015 return SDValue();
52016 if (EltBits0[i] != ~EltBits1[i])
52017 return SDValue();
52018 }
52019
52020 if (useVPTERNLOG(Subtarget, VT)) {
52021 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52022 // VPTERNLOG is only available as vXi32/64-bit types.
52023 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52024 MVT OpVT =
52025 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52026 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52027 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52028 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52029 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52030 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52031 DAG, Subtarget);
52032 return DAG.getBitcast(VT, Res);
52033 }
52034
52035 SDValue X = N->getOperand(0);
52036 SDValue Y =
52037 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52038 DAG.getBitcast(VT, N1.getOperand(0)));
52039 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52040}
52041
52042// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52043// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52044// Waiting for ANDNP combine allows other combines to happen that prevent
52045// matching.
52046static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52047 using namespace SDPatternMatch;
52048 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52049 m_And(m_Deferred(Mask), m_Value(Y))));
52050}
52051
52052// Try to fold:
52053// (or (and (m, y), (pandn m, x)))
52054// into:
52055// (vselect m, x, y)
52056// As a special case, try to fold:
52057// (or (and (m, (sub 0, x)), (pandn m, x)))
52058// into:
52059// (sub (xor X, M), M)
52061 SelectionDAG &DAG,
52062 const X86Subtarget &Subtarget) {
52063 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52064
52065 EVT VT = N->getValueType(0);
52066 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52067 (VT.is256BitVector() && Subtarget.hasInt256())))
52068 return SDValue();
52069
52070 SDValue X, Y, Mask;
52071 if (!matchLogicBlend(N, X, Y, Mask))
52072 return SDValue();
52073
52074 // Validate that X, Y, and Mask are bitcasts, and see through them.
52075 Mask = peekThroughBitcasts(Mask);
52078
52079 EVT MaskVT = Mask.getValueType();
52080 unsigned EltBits = MaskVT.getScalarSizeInBits();
52081
52082 // TODO: Attempt to handle floating point cases as well?
52083 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52084 return SDValue();
52085
52086 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52087 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52088 DAG, Subtarget))
52089 return Res;
52090
52091 // PBLENDVB is only available on SSE 4.1.
52092 if (!Subtarget.hasSSE41())
52093 return SDValue();
52094
52095 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52096 if (Subtarget.hasVLX())
52097 return SDValue();
52098
52099 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52100
52101 X = DAG.getBitcast(BlendVT, X);
52102 Y = DAG.getBitcast(BlendVT, Y);
52103 Mask = DAG.getBitcast(BlendVT, Mask);
52104 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52105 return DAG.getBitcast(VT, Mask);
52106}
52107
52108// Helper function for combineOrCmpEqZeroToCtlzSrl
52109// Transforms:
52110// seteq(cmp x, 0)
52111// into:
52112// srl(ctlz x), log2(bitsize(x))
52113// Input pattern is checked by caller.
52115 SDValue Cmp = Op.getOperand(1);
52116 EVT VT = Cmp.getOperand(0).getValueType();
52117 unsigned Log2b = Log2_32(VT.getSizeInBits());
52118 SDLoc dl(Op);
52119 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52120 // The result of the shift is true or false, and on X86, the 32-bit
52121 // encoding of shr and lzcnt is more desirable.
52122 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52123 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52124 DAG.getConstant(Log2b, dl, MVT::i8));
52125 return Scc;
52126}
52127
52128// Try to transform:
52129// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52130// into:
52131// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
52132// Will also attempt to match more generic cases, eg:
52133// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52134// Only applies if the target supports the FastLZCNT feature.
52137 const X86Subtarget &Subtarget) {
52138 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52139 return SDValue();
52140
52141 auto isORCandidate = [](SDValue N) {
52142 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52143 };
52144
52145 // Check the zero extend is extending to 32-bit or more. The code generated by
52146 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52147 // instructions to clear the upper bits.
52148 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52149 !isORCandidate(N->getOperand(0)))
52150 return SDValue();
52151
52152 // Check the node matches: setcc(eq, cmp 0)
52153 auto isSetCCCandidate = [](SDValue N) {
52154 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52155 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52156 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52157 isNullConstant(N->getOperand(1).getOperand(1)) &&
52158 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52159 };
52160
52161 SDNode *OR = N->getOperand(0).getNode();
52162 SDValue LHS = OR->getOperand(0);
52163 SDValue RHS = OR->getOperand(1);
52164
52165 // Save nodes matching or(or, setcc(eq, cmp 0)).
52167 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52168 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52169 ORNodes.push_back(OR);
52170 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52171 LHS = OR->getOperand(0);
52172 RHS = OR->getOperand(1);
52173 }
52174
52175 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52176 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52177 !isORCandidate(SDValue(OR, 0)))
52178 return SDValue();
52179
52180 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52181 // to
52182 // or(srl(ctlz),srl(ctlz)).
52183 // The dag combiner can then fold it into:
52184 // srl(or(ctlz, ctlz)).
52185 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52186 SDValue Ret, NewRHS;
52187 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52188 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52189
52190 if (!Ret)
52191 return SDValue();
52192
52193 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52194 while (!ORNodes.empty()) {
52195 OR = ORNodes.pop_back_val();
52196 LHS = OR->getOperand(0);
52197 RHS = OR->getOperand(1);
52198 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52199 if (RHS->getOpcode() == ISD::OR)
52200 std::swap(LHS, RHS);
52201 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52202 if (!NewRHS)
52203 return SDValue();
52204 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52205 }
52206
52207 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52208}
52209
52210/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52211/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52212/// with CMP+{ADC, SBB}.
52213/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52214static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52215 SDValue X, SDValue Y,
52216 SelectionDAG &DAG,
52217 bool ZeroSecondOpOnly = false) {
52218 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52219 return SDValue();
52220
52221 // Look through a one-use zext.
52222 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52223 Y = Y.getOperand(0);
52224
52225 X86::CondCode CC;
52226 SDValue EFLAGS;
52227 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52228 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52229 EFLAGS = Y.getOperand(1);
52230 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52231 Y.hasOneUse()) {
52232 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52233 }
52234
52235 if (!EFLAGS)
52236 return SDValue();
52237
52238 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52239 // the general case below.
52240 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52241 if (ConstantX && !ZeroSecondOpOnly) {
52242 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52243 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52244 // This is a complicated way to get -1 or 0 from the carry flag:
52245 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52246 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52247 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52248 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52249 EFLAGS);
52250 }
52251
52252 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52253 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52254 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52255 EFLAGS.getValueType().isInteger() &&
52256 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52257 // Swap the operands of a SUB, and we have the same pattern as above.
52258 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52259 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52260 SDValue NewSub = DAG.getNode(
52261 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52262 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52263 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52264 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52265 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52266 NewEFLAGS);
52267 }
52268 }
52269 }
52270
52271 if (CC == X86::COND_B) {
52272 // X + SETB Z --> adc X, 0
52273 // X - SETB Z --> sbb X, 0
52274 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52275 DAG.getVTList(VT, MVT::i32), X,
52276 DAG.getConstant(0, DL, VT), EFLAGS);
52277 }
52278
52279 if (ZeroSecondOpOnly)
52280 return SDValue();
52281
52282 if (CC == X86::COND_A) {
52283 // Try to convert COND_A into COND_B in an attempt to facilitate
52284 // materializing "setb reg".
52285 //
52286 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52287 // cannot take an immediate as its first operand.
52288 //
52289 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52290 EFLAGS.getValueType().isInteger() &&
52291 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52292 SDValue NewSub =
52293 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52294 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52295 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52296 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52297 DAG.getVTList(VT, MVT::i32), X,
52298 DAG.getConstant(0, DL, VT), NewEFLAGS);
52299 }
52300 }
52301
52302 if (CC == X86::COND_AE) {
52303 // X + SETAE --> sbb X, -1
52304 // X - SETAE --> adc X, -1
52305 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52306 DAG.getVTList(VT, MVT::i32), X,
52307 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52308 }
52309
52310 if (CC == X86::COND_BE) {
52311 // X + SETBE --> sbb X, -1
52312 // X - SETBE --> adc X, -1
52313 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52314 // materializing "setae reg".
52315 //
52316 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52317 // cannot take an immediate as its first operand.
52318 //
52319 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52320 EFLAGS.getValueType().isInteger() &&
52321 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52322 SDValue NewSub =
52323 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52324 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52325 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52326 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52327 DAG.getVTList(VT, MVT::i32), X,
52328 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52329 }
52330 }
52331
52332 if (CC != X86::COND_E && CC != X86::COND_NE)
52333 return SDValue();
52334
52335 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52336 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52337 !EFLAGS.getOperand(0).getValueType().isInteger())
52338 return SDValue();
52339
52340 SDValue Z = EFLAGS.getOperand(0);
52341 EVT ZVT = Z.getValueType();
52342
52343 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52344 // the general case below.
52345 if (ConstantX) {
52346 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52347 // fake operands:
52348 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52349 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52350 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52351 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52352 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52353 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52354 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52355 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52356 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52357 SDValue(Neg.getNode(), 1));
52358 }
52359
52360 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52361 // with fake operands:
52362 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52363 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52364 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52365 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52366 SDValue One = DAG.getConstant(1, DL, ZVT);
52367 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52368 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52369 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52370 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52371 Cmp1.getValue(1));
52372 }
52373 }
52374
52375 // (cmp Z, 1) sets the carry flag if Z is 0.
52376 SDValue One = DAG.getConstant(1, DL, ZVT);
52377 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52378 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52379
52380 // Add the flags type for ADC/SBB nodes.
52381 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52382
52383 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52384 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52385 if (CC == X86::COND_NE)
52386 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52387 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52388
52389 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52390 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52391 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52392 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52393}
52394
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
// NOTE(review): the first line of this signature (return type, function name,
// and leading parameters) was lost in extraction — restore from upstream
// before compiling. The body below is complete.
                                       SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // Try folding with the operands in their original order first.
  if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
    return ADCOrSBB;

  // Commute and try again (negate the result for subtracts).
  if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
    if (IsSub)
      ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
    return ADCOrSBB;
  }

  return SDValue();
}
52417
/// Fold (xor/or (zero_extend (setcc)) imm) patterns into ADC/SBB, and fold a
/// vector 'not' of a power-of-2 bit-test PCMPEQ into a direct PCMPEQ against
/// the power-of-2 constant.
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
                                     SDValue N0, SDValue N1,
                                     SelectionDAG &DAG) {
  assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");

  // Delegate to combineAddOrSubToADCOrSBB if we have:
  //
  // (xor/or (zero_extend (setcc)) imm)
  //
  // where imm is odd if and only if we have xor, in which case the XOR/OR are
  // equivalent to a SUB/ADD, respectively.
  if (N0.getOpcode() == ISD::ZERO_EXTEND &&
      N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
    if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
      bool IsSub = Opc == ISD::XOR;
      bool N1COdd = N1C->getZExtValue() & 1;
      // xor with an odd constant flips the low bit like a sub of the 0/1
      // setcc result; or with an even constant behaves like an add.
      if (IsSub ? N1COdd : !N1COdd)
        if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
          return R;
    }
  }

  // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
  if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
      N0.getOperand(0).getOpcode() == ISD::AND &&
      // NOTE(review): several condition lines (original 52443-52445) were
      // lost in extraction here — restore from upstream before compiling.
      VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
    return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
                       N0.getOperand(0).getOperand(1));
  }

  return SDValue();
}
52453
// Combine for ISD::OR nodes. Tries, in order: SSE1 FOR conversion, any-of
// bool-reduction to MOVMSK, CCMP/CTEST folds, generic bit-op folds,
// SETCC_CARRY|C to LEA, mask-register KUNPCK formation, recursive shuffle
// combining, and finally the shared OR/XOR-with-SETCC fold.
// NOTE(review): the signature head (original lines 52454-52455, presumably
// "static SDValue combineOr(SDNode *N, SelectionDAG &DAG, ..., ") was lost in
// extraction — restore from upstream before compiling.
                         const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If this is SSE1 only convert to FOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(MVT::v4i32,
                          DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
                                      DAG.getBitcast(MVT::v4f32, N0),
                                      DAG.getBitcast(MVT::v4f32, N1)));
  }

  // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
  // TODO: Support multiple SrcOps.
  if (VT == MVT::i1) {
    // NOTE(review): the SrcOps declaration (original line 52474) was lost in
    // extraction here.
    SmallVector<APInt, 2> SrcPartials;
    if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
        SrcOps.size() == 1) {
      unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
      if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
        Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
      if (Mask) {
        assert(SrcPartials[0].getBitWidth() == NumElts &&
               "Unexpected partial reduction mask");
        SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
        SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
        Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
        return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
      }
    }
  }

  if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
    return SetCC;

  if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
    return R;

  if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
    return R;

  if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
                                                 DAG, DCI, Subtarget))
    return FPLogic;

  // The remaining folds only apply once operand legalization has run.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
    return R;

  if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
    return R;

  // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
  // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
  if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
    if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
      uint64_t Val = CN->getZExtValue();
      // Val+1 must be a legal LEA scale (2,3,4,5,8,9), hence this value set.
      if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
          Val == 8) {
        SDValue NotCond;
        if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
            N0.getOperand(1).hasOneUse()) {
          // NOTE(review): the condition-code extraction/inversion lines
          // (original 52532-52533, defining NewCC) were lost here.
          NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
        } else if (N0.getOpcode() == ISD::SUB &&
                   isNullConstant(N0.getOperand(0))) {
          SDValue Cond = N0.getOperand(1);
          if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
            Cond = Cond.getOperand(0);
          if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
            X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
            // NOTE(review): the NewCC definition (original line 52542,
            // presumably inverting OldCC) was lost here.
            NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
          }
        }

        if (NotCond) {
          SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
          R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
          R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
          return R;
        }
      }
    }
  }

  // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
  // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
  // iff the upper elements of the non-shifted arg are zero.
  // KUNPCK require 16+ bool vector elements.
  if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned HalfElts = NumElts / 2;
    APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
    if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
        N1.getConstantOperandAPInt(1) == HalfElts &&
        DAG.MaskedVectorIsZero(N0, UpperElts)) {
      return DAG.getNode(
          ISD::CONCAT_VECTORS, dl, VT,
          extractSubVector(N0, 0, DAG, dl, HalfElts),
          extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
    }
    if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
        N0.getConstantOperandAPInt(1) == HalfElts &&
        DAG.MaskedVectorIsZero(N1, UpperElts)) {
      return DAG.getNode(
          ISD::CONCAT_VECTORS, dl, VT,
          extractSubVector(N1, 0, DAG, dl, HalfElts),
          extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
    }
  }

  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    // Attempt to recursively combine an OR of shuffles.
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;

    // If either operand is a constant mask, then only the elements that aren't
    // allones are actually demanded by the other operand.
    auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
      APInt UndefElts;
      SmallVector<APInt> EltBits;
      int NumElts = VT.getVectorNumElements();
      int EltSizeInBits = VT.getScalarSizeInBits();
      if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
        return false;

      APInt DemandedElts = APInt::getZero(NumElts);
      for (int I = 0; I != NumElts; ++I)
        if (!EltBits[I].isAllOnes())
          DemandedElts.setBit(I);

      return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
    };
    if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
      // The node may have been simplified in place; requeue it unless deleted.
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
    return R;

  return SDValue();
}
52618
/// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
// NOTE(review): the signature head (original line 52623) was lost in
// extraction — restore from upstream before compiling.
                                        SelectionDAG &DAG) {
  // This is only worth doing if the output type is i8 or i1.
  EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();

  // Make sure we are performing an xor against one.
  if (!isOneConstant(N1))
    return SDValue();

  // SetCC on x86 zero extends so only act on this if it's a logical shift.
  SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();

  // Make sure we are truncating from one of i16, i32 or i64.
  EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();

  // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
    return SDValue();

  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical looking
  // comparison, using SETGT matches up with what TranslateX86CC.
  SDValue ShiftOp = Shift.getOperand(0);
  EVT ShiftOpTy = ShiftOp.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                               *DAG.getContext(), ResultType);
  SDValue Cond =
      DAG.getSetCC(DL, SetCCResultType, ShiftOp,
                   DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
  // The setcc result type may differ from the requested result type;
  // zero-extend to match (setcc itself zero extends on x86).
  if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
52671
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
// NOTE(review): the signature head (original line 52679) was lost in
// extraction — restore from upstream before compiling.
                                         const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isSimple())
    return SDValue();

  // Only the vector types with a native PCMPGT are handled:
  // 128-bit needs SSE2, 256-bit needs AVX2.
  switch (VT.getSimpleVT().SimpleTy) {
  // clang-format off
  default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32:
  case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  // clang-format on
  }

  // There must be a shift right algebraic before the xor, and the xor must be a
  // 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
      // NOTE(review): a condition line (original 52704, presumably the
      // all-ones check on Ones) was lost in extraction here.
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftAmt =
      isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
  if (!ShiftAmt ||
      ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
    return SDValue();

  // Create a greater-than comparison against -1. We don't use the more obvious
  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
  return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
52718
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
///   Return the source value x to be truncated or SDValue() if the pattern was
///   not matched.
///
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
///   where C1 >= 0 and C2 is unsigned max of destination type.
///
///    (truncate (smax (smin (x, C2), C1)) to dest_type)
///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
///
/// These two patterns are equivalent to:
/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
/// So return the smax(x, C1) value to be truncated or SDValue() if the
/// pattern was not matched.
// NOTE(review): the signature head (original line 52735) was lost in
// extraction — restore from upstream before compiling.
                                 const SDLoc &DL) {
  using namespace llvm::SDPatternMatch;
  EVT InVT = In.getValueType();

  // Saturation with truncation. We truncate from InVT to VT.
  // NOTE(review): the first line of this assert (original 52741) and the
  // declarations of UMin/SMin/SMax (original 52745) were lost in extraction.
         "Unexpected types for truncate operation");

  APInt C1, C2;

  // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
  // the element size of the destination type.
  if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
      C2.isMask(VT.getScalarSizeInBits()))
    return UMin;

  if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
      sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
    return SMin;

  if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
      sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
    return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));

  return SDValue();
}
52765
52766/// Detect patterns of truncation with signed saturation:
52767/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52768/// signed_max_of_dest_type)) to dest_type)
52769/// or:
52770/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52771/// signed_min_of_dest_type)) to dest_type).
52772/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52773/// Return the source value to be truncated or SDValue() if the pattern was not
52774/// matched.
52775static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52776 using namespace llvm::SDPatternMatch;
52777 unsigned NumDstBits = VT.getScalarSizeInBits();
52778 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52779 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52780
52781 APInt SignedMax, SignedMin;
52782 if (MatchPackUS) {
52783 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52784 SignedMin = APInt::getZero(NumSrcBits);
52785 } else {
52786 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52787 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52788 }
52789
52790 SDValue SMin, SMax;
52791 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52792 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52793 return SMax;
52794
52795 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52796 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52797 return SMin;
52798
52799 return SDValue();
52800}
52801
// Try to lower a saturating truncate (detected via detectSSatPattern /
// detectUSatPattern) to PACKSS/PACKUS or AVX-512 VTRUNCS/VTRUNCUS nodes.
// NOTE(review): the signature head (original line 52802, presumably
// "static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,")
// was lost in extraction — restore from upstream before compiling.
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2() || !VT.isVector())
    return SDValue();

  EVT SVT = VT.getVectorElementType();
  EVT InVT = In.getValueType();
  EVT InSVT = InVT.getVectorElementType();

  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
  // split across two registers. We can use a packusdw+perm to clamp to 0-65535
  // and concatenate at the same time. Then we can use a final vpmovuswb to
  // clip to 0-255.
  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      InVT == MVT::v16i32 && VT == MVT::v16i8) {
    if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
      // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
      SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
                                           DL, DAG, Subtarget);
      assert(Mid && "Failed to pack!");
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
    }
  }

  // vXi32 truncate instructions are available with AVX512F.
  // vXi16 truncate instructions are only available with AVX512BW.
  // For 256-bit or smaller vectors, we require VLX.
  // FIXME: We could widen truncates to 512 to remove the VLX restriction.
  // If the result type is 256-bits or larger and we have disable 512-bit
  // registers, we should go ahead and use the pack instructions if possible.
  bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
                       (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
                      (InVT.getSizeInBits() > 128) &&
                      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
                      !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);

  if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
      // NOTE(review): a condition line (original 52840) was lost in
      // extraction here.
      (SVT == MVT::i8 || SVT == MVT::i16) &&
      (InSVT == MVT::i16 || InSVT == MVT::i32)) {
    if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
      if (SVT == MVT::i8 && InSVT == MVT::i32) {
        EVT MidVT = VT.changeVectorElementType(MVT::i16);
        SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
                                             DAG, Subtarget);
        assert(Mid && "Failed to pack!");
        // NOTE(review): the line starting the second truncateVectorWithPACK
        // call (original 52850, defining V) was lost in extraction here.
                                              Subtarget);
        assert(V && "Failed to pack!");
        return V;
      } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
        return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
                                      Subtarget);
    }
    if (SDValue SSatVal = detectSSatPattern(In, VT))
      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
                                    Subtarget);
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
      Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
      (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
    unsigned TruncOpc = 0;
    SDValue SatVal;
    if (SDValue SSatVal = detectSSatPattern(In, VT)) {
      SatVal = SSatVal;
      TruncOpc = X86ISD::VTRUNCS;
    } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
      SatVal = USatVal;
      TruncOpc = X86ISD::VTRUNCUS;
    }
    if (SatVal) {
      unsigned ResElts = VT.getVectorNumElements();
      // If the input type is less than 512 bits and we don't have VLX, we need
      // to widen to 512 bits.
      if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
        unsigned NumConcats = 512 / InVT.getSizeInBits();
        ResElts *= NumConcats;
        SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
        ConcatOps[0] = SatVal;
        InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
                                NumConcats * InVT.getVectorNumElements());
        SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
      }
      // Widen the result if its narrower than 128 bits.
      if (ResElts * SVT.getSizeInBits() < 128)
        ResElts = 128 / SVT.getSizeInBits();
      EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
      SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getVectorIdxConstant(0, DL));
    }
  }

  return SDValue();
}
52901
// If this constant-pool load's value matches the low bits of a wider
// constant-pool load/broadcast already on the same chain, reuse the wider
// node and extract the low subvector instead of emitting a second load.
// NOTE(review): the signature head (original line 52902) and the DCI
// parameter line (original 52904) were lost in extraction — restore from
// upstream before compiling.
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  auto *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  SDValue Ptr = Ld->getBasePtr();
  SDValue Chain = Ld->getChain();
  ISD::LoadExtType Ext = Ld->getExtensionType();

  if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
    return SDValue();

  if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
    return SDValue();

  // NOTE(review): the LdC definition line (original 52918, presumably
  // obtaining the constant behind Ptr) was lost in extraction here.
  if (!LdC)
    return SDValue();

  // Returns true if every defined bit of this load matches the corresponding
  // bit of the wider user constant.
  auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
                         ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
    for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
      if (Undefs[I])
        continue;
      if (UserUndefs[I] || Bits[I] != UserBits[I])
        return false;
    }
    return true;
  };

  // Look through all other loads/broadcasts in the chain for another constant
  // pool entry.
  for (SDNode *User : Chain->users()) {
    auto *UserLd = dyn_cast<MemSDNode>(User);
    if (User != N && UserLd &&
        (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
         User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
         // NOTE(review): a condition line (original 52940) was lost in
         // extraction here.
        UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
        User->getValueSizeInBits(0).getFixedValue() >
            RegVT.getFixedSizeInBits()) {
      EVT UserVT = User->getValueType(0);
      SDValue UserPtr = UserLd->getBasePtr();
      const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);

      // See if we are loading a constant that matches in the lower
      // bits of a longer constant (but from a different constant pool ptr).
      if (UserC && UserPtr != Ptr) {
        unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
        unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
        if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
          APInt Undefs, UserUndefs;
          SmallVector<APInt> Bits, UserBits;
          unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
                                      UserVT.getScalarSizeInBits());
          if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
                                            Bits) &&
              // NOTE(review): the matching call head for the user node
              // (original line 52960) was lost in extraction here.
                                            UserUndefs, UserBits)) {
            if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
              // NOTE(review): the SDLoc definition (original line 52963)
              // was lost in extraction here.
              SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
                                                 RegVT.getSizeInBits());
              Extract = DAG.getBitcast(RegVT, Extract);
              return DCI.CombineTo(N, Extract, SDValue(User, 1));
            }
          }
        }
      }
    }
  }

  return SDValue();
}
52977
// Combine for ISD::LOAD nodes: split slow 32-byte unaligned / non-temporal
// loads, turn legal-width bool-vector loads into integer loads, reuse wider
// broadcasts of the same address, fold duplicate constant-pool loads, and
// normalize ptr32/ptr64 address spaces.
// NOTE(review): the signature head (original lines 52978-52979) was lost in
// extraction — restore from upstream before compiling.
                           const X86Subtarget &Subtarget) {
  auto *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations. Also split non-temporal aligned loads on
  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  unsigned Fast;
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
        Ld->getAlign() >= Align(16)) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                               *Ld->getMemOperand(), &Fast) &&
        !Fast))) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    unsigned HalfOffset = 16;
    SDValue Ptr1 = Ld->getBasePtr();
    SDValue Ptr2 =
        DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
                    Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
                    Ld->getPointerInfo().getWithOffset(HalfOffset),
                    Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
    // Merge the two load chains so later memory ops depend on both halves.
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1), Load2.getValue(1));

    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  // Bool vector load - attempt to cast to an integer, as we have good
  // (vXiY *ext(vXi1 bitcast(iX))) handling.
  if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
      RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
    unsigned NumElts = RegVT.getVectorNumElements();
    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
    if (TLI.isTypeLegal(IntVT)) {
      SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
                                    Ld->getPointerInfo(), Ld->getBaseAlign(),
                                    Ld->getMemOperand()->getFlags());
      SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
      return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
    }
  }

  // If we also broadcast this vector to a wider type, then just extract the
  // lowest subvector.
  if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
      (RegVT.is128BitVector() || RegVT.is256BitVector())) {
    SDValue Ptr = Ld->getBasePtr();
    SDValue Chain = Ld->getChain();
    for (SDNode *User : Chain->users()) {
      auto *UserLd = dyn_cast<MemSDNode>(User);
      if (User != N && UserLd &&
          User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
          UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
          UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
          User->hasAnyUseOfValue(0) &&
          User->getValueSizeInBits(0).getFixedValue() >
              RegVT.getFixedSizeInBits()) {
        // NOTE(review): an SDLoc definition line (original 53053) was lost
        // in extraction here.
        SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
                                           RegVT.getSizeInBits());
        Extract = DAG.getBitcast(RegVT, Extract);
        return DCI.CombineTo(N, Extract, SDValue(User, 1));
      }
    }
  }

  if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
    return V;

  // Cast ptr32 and ptr64 pointers to the default address space before a load.
  unsigned AddrSpace = Ld->getAddressSpace();
  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
      AddrSpace == X86AS::PTR32_UPTR) {
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
      SDValue Cast =
          DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
      return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
                            Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
                            Ld->getMemOperand()->getFlags());
    }
  }

  return SDValue();
}
53081
53082/// If V is a build vector of boolean constants and exactly one of those
53083/// constants is true, return the operand index of that true element.
53084/// Otherwise, return -1.
53085static int getOneTrueElt(SDValue V) {
53086 // This needs to be a build vector of booleans.
53087 // TODO: Checking for the i1 type matches the IR definition for the mask,
53088 // but the mask check could be loosened to i8 or other types. That might
53089 // also require checking more than 'allOnesValue'; eg, the x86 HW
53090 // instructions only require that the MSB is set for each mask element.
53091 // The ISD::MSTORE comments/definition do not specify how the mask operand
53092 // is formatted.
53093 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53094 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53095 return -1;
53096
53097 int TrueIndex = -1;
53098 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53099 for (unsigned i = 0; i < NumElts; ++i) {
53100 const SDValue &Op = BV->getOperand(i);
53101 if (Op.isUndef())
53102 continue;
53103 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53104 if (!ConstNode)
53105 return -1;
53106 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53107 // If we already found a one, this is too many.
53108 if (TrueIndex >= 0)
53109 return -1;
53110 TrueIndex = i;
53111 }
53112 }
53113 return TrueIndex;
53114}
53115
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
// NOTE(review): the signature head (original line 53120, with the function
// name and first parameter) was lost in extraction — restore from upstream
// before compiling.
                                          SelectionDAG &DAG, SDValue &Addr,
                                          SDValue &Index, Align &Alignment,
                                          unsigned &Offset) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Offset = 0;
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    Offset = TrueMaskElt * EltVT.getStoreSize();
    // NOTE(review): the line starting the base-plus-offset computation
    // (original 53135, presumably reassigning Addr) was lost in extraction.
                                     SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  // Scalar access alignment is the base alignment reduced to the element size.
  Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
  return true;
}
53143
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
// NOTE(review): the function name and parameter lines (original 53149-53150)
// were lost in extraction — restore from upstream before compiling.
                             const X86Subtarget &Subtarget) {
  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  Align Alignment;
  unsigned Offset;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // On 32-bit targets an i64 element cannot be loaded as a scalar integer;
  // load it as f64 and bitcast the vector accordingly.
  EVT CastVT = VT;
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    CastVT = VT.changeVectorElementType(EltVT);
  }

  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
                  ML->getPointerInfo().getWithOffset(Offset),
                  Alignment, ML->getMemOperand()->getFlags());

  SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
  Insert = DAG.getBitcast(VT, Insert);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
53188
// For a masked load whose mask is a constant build vector: if the first and
// last elements are demanded, replace it with a full vector load + select;
// otherwise rewrite it as a masked load with undef pass-through followed by
// a select, which lowers to a cheaper blend.
static SDValue
// NOTE(review): the function name and parameter lines (original 53190-53191)
// were lost in extraction — restore from upstream before compiling.
  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
                                  ML->getPassThru());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).

  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getPassThru().isUndef())
    return SDValue();

  if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(
      VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
      DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
      ML->getAddressingMode(), ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
                                ML->getPassThru());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
53237
// DAG combine for ISD::MLOAD: try the scalar-load and constant-mask
// simplifications above, then simplify the (legalized, non-boolean) mask by
// demanding only the sign bit of each lane.
                                 const X86Subtarget &Subtarget) {
  auto *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad =
            reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
      return ScalarLoad;

    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  // If the mask value has been legalized to a non-boolean vector, try to
  // simplify ops leading up to it. We only demand the MSB of each lane.
  SDValue Mask = Mld->getMask();
  if (Mask.getScalarValueSizeInBits() != 1) {
    EVT VT = Mld->getValueType(0);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
      // The node may have been CSE'd away; only revisit it if still alive.
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
    // If a simpler multi-use mask value exists, rebuild the masked load with
    // it (all other operands unchanged).
    if (SDValue NewMask =
      return DAG.getMaskedLoad(
          VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
          NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
          Mld->getAddressingMode(), Mld->getExtensionType());
  }

  return SDValue();
}
53280
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
                                              SelectionDAG &DAG,
                                              const X86Subtarget &Subtarget) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  Align Alignment;
  unsigned Offset;
  // Bail out unless exactly one lane of the mask is known-true; on success the
  // helper fills in the address/offset of that single stored element.
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  SDValue Value = MS->getValue();
  EVT VT = Value.getValueType();
  EVT EltVT = VT.getVectorElementType();
  // i64 scalar stores are not legal on 32-bit targets; extract as f64 instead.
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    EVT CastVT = VT.changeVectorElementType(EltVT);
    Value = DAG.getBitcast(CastVT, Value);
  }
  SDValue Extract =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
                      Alignment, MS->getMemOperand()->getFlags());
}
53316
53319 const X86Subtarget &Subtarget) {
53320 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53321 if (Mst->isCompressingStore())
53322 return SDValue();
53323
53324 EVT VT = Mst->getValue().getValueType();
53325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53326
53327 if (Mst->isTruncatingStore())
53328 return SDValue();
53329
53330 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53331 return ScalarStore;
53332
53333 // If the mask value has been legalized to a non-boolean vector, try to
53334 // simplify ops leading up to it. We only demand the MSB of each lane.
53335 SDValue Mask = Mst->getMask();
53336 if (Mask.getScalarValueSizeInBits() != 1) {
53338 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53339 if (N->getOpcode() != ISD::DELETED_NODE)
53340 DCI.AddToWorklist(N);
53341 return SDValue(N, 0);
53342 }
53343 if (SDValue NewMask =
53345 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53346 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53347 Mst->getMemoryVT(), Mst->getMemOperand(),
53348 Mst->getAddressingMode());
53349 }
53350
53351 SDValue Value = Mst->getValue();
53352 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53353 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53354 Mst->getMemoryVT())) {
53355 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53356 Mst->getBasePtr(), Mst->getOffset(), Mask,
53357 Mst->getMemoryVT(), Mst->getMemOperand(),
53358 Mst->getAddressingMode(), true);
53359 }
53360
53361 return SDValue();
53362}
53363
// DAG combine for ISD::STORE: a dispatcher over many unrelated store
// peepholes (vXi1 stores, 256-bit store splitting, non-temporal splitting,
// trunc-store folds, CMOV->CSTORE, and i64 load/store -> f64 on 32-bit).
// Each transform returns immediately when it fires; falling through to the
// end means no combine applied.
                                 const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getValue();
  EVT VT = StoredVal.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Convert a store of vXi1 into a store of iX and a bitcast.
  if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
      VT.getVectorElementType() == MVT::i1) {

    StoredVal = DAG.getBitcast(NewVT, StoredVal);

    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getBaseAlign(),
                        St->getMemOperand()->getFlags());
  }

  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
  // This will avoid a copy to k-register.
  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      StoredVal.getOperand(0).getValueType() == MVT::i8) {
    SDValue Val = StoredVal.getOperand(0);
    // We must store zeros to the unused bits.
    Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
    return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
                        St->getPointerInfo(), St->getBaseAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Widen v2i1/v4i1 stores to v8i1.
  if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
      Subtarget.hasAVX512()) {
    unsigned NumConcats = 8 / VT.getVectorNumElements();
    // We must store zeros to the unused bits.
    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
    Ops[0] = StoredVal;
    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getBaseAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Turn vXi1 stores of constants into a scalar store.
  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
    // If its a v64i1 store without 64-bit support, we need two stores.
    if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // Split into low/high 32-element halves and emit two i32 stores at
      // offsets 0 and 4.
      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(0, 32));
      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(32, 32));

      SDValue Ptr0 = St->getBasePtr();
      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);

      SDValue Ch0 =
          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
                       St->getBaseAlign(), St->getMemOperand()->getFlags());
      SDValue Ch1 = DAG.getStore(
          St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
          St->getBaseAlign(), St->getMemOperand()->getFlags());
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }

    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getBaseAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Convert scalar fabs/fneg load-store to integer equivalents.
  if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
      (StoredVal.getOpcode() == ISD::FABS ||
       StoredVal.getOpcode() == ISD::FNEG) &&
      ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
      StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
    MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
    if (TLI.isTypeLegal(IntVT)) {
      // FNEG flips the sign bit (XOR); FABS clears it (AND with ~signmask).
      unsigned SignOp = ISD::XOR;
      if (StoredVal.getOpcode() == ISD::FABS) {
        SignMask = ~SignMask;
        SignOp = ISD::AND;
      }
      SDValue LogicOp = DAG.getNode(
          SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
          DAG.getConstant(SignMask, dl, IntVT));
      return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
                          St->getPointerInfo(), St->getBaseAlign(),
                          St->getMemOperand()->getFlags());
    }
  }

  // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
  // Sandy Bridge, perform two 16-byte stores.
  unsigned Fast;
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             *St->getMemOperand(), &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    return splitVectorStore(St, DAG);
  }

  // Split under-aligned vector non-temporal stores.
  if (St->isNonTemporal() && StVT == VT &&
      St->getAlign().value() < VT.getStoreSize()) {
    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
    // vectors or the legalizer can scalarize it to use MOVNTI.
    if (VT.is256BitVector() || VT.is512BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
      if (NumElems < 2)
        return SDValue();
      return splitVectorStore(St, DAG);
    }

    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
    // to use MOVNTI.
    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
      MVT NTVT = Subtarget.hasSSE4A()
                     ? MVT::v2f64
                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
      return scalarizeVectorStore(St, NTVT, DAG);
    }
  }

  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
  // supported, but avx512f is by extending to v16i32 and truncating.
  if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
      St->getValue().getOpcode() == ISD::TRUNCATE &&
      St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
      TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
      St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
                              St->getValue().getOperand(0));
    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
                             MVT::v16i8, St->getMemOperand());
  }

  // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
  if (!St->isTruncatingStore() &&
      (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
       StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
      StoredVal.hasOneUse() &&
      TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
    bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
    return EmitTruncSStore(IsSigned, St->getChain(),
                           dl, StoredVal.getOperand(0), St->getBasePtr(),
                           VT, St->getMemOperand(), DAG);
  }

  // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
  if (!St->isTruncatingStore()) {
    // Peek through a one-use truncate and a zero-index element extract to
    // find the extracted source vector (empty SDValue if no match).
    auto IsExtractedElement = [](SDValue V) {
      if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
        V = V.getOperand(0);
      unsigned Opc = V.getOpcode();
          isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
          V.getOperand(0).hasOneUse())
        return V.getOperand(0);
      return SDValue();
    };
    if (SDValue Extract = IsExtractedElement(StoredVal)) {
      SDValue Trunc = peekThroughOneUseBitcasts(Extract);
      if (Trunc.getOpcode() == X86ISD::VTRUNC) {
        SDValue Src = Trunc.getOperand(0);
        MVT DstVT = Trunc.getSimpleValueType();
        MVT SrcVT = Src.getSimpleValueType();
        unsigned NumSrcElts = SrcVT.getVectorNumElements();
        unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
        MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
        // Only fold if the truncated result exactly covers the stored width.
        if (NumTruncBits == VT.getSizeInBits() &&
            TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
          return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
                                   TruncVT, St->getMemOperand());
        }
      }
    }
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    if (TLI.isTruncStoreLegal(VT, StVT)) {
      if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
        return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                               dl, Val, St->getBasePtr(),
                               St->getMemoryVT(), St->getMemOperand(), DAG);
      if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
                                          DAG, dl))
        return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                               dl, Val, St->getBasePtr(),
                               St->getMemoryVT(), St->getMemOperand(), DAG);
    }

    // No further combines apply to vector truncating stores.
    return SDValue();
  }

  // Cast ptr32 and ptr64 pointers to the default address space before a store.
  unsigned AddrSpace = St->getAddressSpace();
  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
      AddrSpace == X86AS::PTR32_UPTR) {
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    if (PtrVT != St->getBasePtr().getSimpleValueType()) {
      SDValue Cast =
          DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
      return DAG.getTruncStore(
          St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
          St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
    }
  }

  // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
  // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
  if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
      Subtarget.hasCF() && St->isSimple()) {
    SDValue Cmov;
    if (StoredVal.getOpcode() == X86ISD::CMOV)
      Cmov = StoredVal;
    else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
             StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
      Cmov = StoredVal.getOperand(0);
    else
      return SDValue();

    // The load feeding the CMOV must be from the same simple location that we
    // are storing back to, and must be the store's direct chain predecessor.
    auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
    if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
      return SDValue();

    // Work out which CMOV operand is the loaded value; the condition must be
    // inverted if the load is the true (taken) operand.
    bool InvertCC = false;
    SDValue V = SDValue(Ld, 0);
    if (V == Cmov.getOperand(1))
      InvertCC = true;
    else if (V != Cmov.getOperand(0))
      return SDValue();

    SDVTList Tys = DAG.getVTList(MVT::Other);
    SDValue CC = Cmov.getOperand(2);
    SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
    if (InvertCC)
      CC = DAG.getTargetConstant(
          dl, MVT::i8);
    SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
                     Cmov.getOperand(3)};
    return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
                                   St->getMemOperand());
  }

  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();

  // The remaining transforms only apply on 32-bit targets with legal f64.
  if (!F64IsLegal || Subtarget.is64Bit())
    return SDValue();

  if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
      cast<LoadSDNode>(St->getValue())->isSimple() &&
      St->getChain().hasOneUse() && St->isSimple()) {
    auto *Ld = cast<LoadSDNode>(St->getValue());

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // Avoid the transformation if there are multiple uses of the loaded value.
    if (!Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);

    // Remove any range metadata as we're converting to f64 load/store.
    Ld->getMemOperand()->clearRanges();

    // Lower to a single movq load/store pair.
    SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
                                Ld->getBasePtr(), Ld->getMemOperand());

    // Make sure new load is placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
    return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                        St->getMemOperand());
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 &&
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getBaseAlign(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}
53695
// Combine for a memory-intrinsic vector store (presumably
// X86ISD::VEXTRACT_STORE — TODO confirm from the caller): only the low
// elements that actually reach memory are demanded, so try to simplify the
// stored vector value accordingly.
                                   const X86Subtarget &Subtarget) {
  auto *St = cast<MemIntrinsicSDNode>(N);

  SDValue StoredVal = N->getOperand(1);
  MVT VT = StoredVal.getSimpleValueType();
  EVT MemVT = St->getMemoryVT();

  // Figure out which elements we demand.
  unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
  APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
    // The node may have been CSE'd away; only revisit it if still alive.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  return SDValue();
}
53718
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
/// A = < float a0, float a1, float a2, float a3 >
/// and
/// B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
                              SelectionDAG &DAG, const X86Subtarget &Subtarget,
                              bool IsCommutative,
                              SmallVectorImpl<int> &PostShuffleMask,
                              bool ForceHorizOp) {
  // If either operand is undef, bail out. The binop should be simplified.
  if (LHS.isUndef() || RHS.isUndef())
    return false;

  // Look for the following pattern:
  // A = < float a0, float a1, float a2, float a3 >
  // B = < float b0, float b1, float b2, float b3 >
  // and
  // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  MVT VT = LHS.getSimpleValueType();
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");
  unsigned NumElts = VT.getVectorNumElements();

  // Decompose Op into two shuffle sources (N0/N1) and a mask scaled to
  // NumElts elements; leaves ShuffleMask empty if Op is not a usable shuffle.
  auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
                        SmallVectorImpl<int> &ShuffleMask) {
    // Allow looking through a low-half extract of a 256-bit source.
    bool UseSubVector = false;
    if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Op.getOperand(0).getValueType().is256BitVector() &&
        llvm::isNullConstant(Op.getOperand(1))) {
      Op = Op.getOperand(0);
      UseSubVector = true;
    }
    SmallVector<int, 16> SrcMask, ScaledMask;
    if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
        !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
          return Op.getValueSizeInBits() == BC.getValueSizeInBits();
        })) {
      resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
      if (!UseSubVector && SrcOps.size() <= 2 &&
          scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
        N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
        N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
        ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
      }
      if (UseSubVector && SrcOps.size() == 1 &&
          scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
        // Split the single 256-bit source into halves and keep the mask for
        // the low half only.
        std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
        ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
        ShuffleMask.assign(Mask.begin(), Mask.end());
      }
    }
  };

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle, then pretend it is the identity shuffle:
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: A default initialized SDValue represents an UNDEF of type VT.
  SDValue A, B;
  GetShuffle(LHS, A, B, LMask);

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  GetShuffle(RHS, C, D, RMask);

  // At least one of the operands should be a vector shuffle.
  unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
  if (NumShuffles == 0)
    return false;

  if (LMask.empty()) {
    A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask.push_back(i);
  }

  if (RMask.empty()) {
    C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask.push_back(i);
  }

  // If we have an unary mask, ensure the other op is set to null.
  if (isUndefOrInRange(LMask, 0, NumElts))
    B = SDValue();
  else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
    A = SDValue();

  if (isUndefOrInRange(RMask, 0, NumElts))
    D = SDValue();
  else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
    C = SDValue();

  // If A and B occur in reverse order in RHS, then canonicalize by commuting
  // RHS operands and shuffle mask.
  if (A != C) {
    std::swap(C, D);
  }
  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D))
    return false;

  PostShuffleMask.clear();
  PostShuffleMask.append(NumElts, SM_SentinelUndef);

  // LHS and RHS are now:
  //   LHS = shuffle A, B, LMask
  //   RHS = shuffle A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
  // so we just repeat the inner loop if this is a 256-bit op.
  unsigned Num128BitChunks = VT.getSizeInBits() / 128;
  unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
  assert((NumEltsPer128BitChunk % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
    for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
      // Ignore undefined components.
      int LIdx = LMask[i + j], RIdx = RMask[i + j];
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive odd/even elements are being operated on. If not,
      // this is not a horizontal operation.
      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
        return false;

      // Compute the post-shuffle mask index based on where the element
      // is stored in the HOP result, and where it needs to be moved to.
      int Base = LIdx & ~1u;
      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));

      // The low half of the 128-bit result must choose from A.
      // The high half of the 128-bit result must choose from B,
      // unless B is undef. In that case, we are always choosing from A.
      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
        Index += NumEltsPer64BitChunk;
      PostShuffleMask[i + j] = Index;
    }
  }

  SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

  // An identity post-shuffle means no fixup shuffle is needed afterwards.
  bool IsIdentityPostShuffle =
      isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
  if (IsIdentityPostShuffle)
    PostShuffleMask.clear();

  // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
  if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
      isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
    return false;

  // If the source nodes are already used in HorizOps then always accept this.
  // Shuffle folding should merge these back together.
  auto FoundHorizUser = [&](SDNode *User) {
    return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
  };
  ForceHorizOp =
      ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
                       llvm::any_of(NewRHS->users(), FoundHorizUser));

  // Assume a SingleSource HOP if we only shuffle one input and don't need to
  // shuffle the result.
  if (!ForceHorizOp &&
      !shouldUseHorizontalOp(NewLHS == NewRHS &&
                                 (NumShuffles < 2 || !IsIdentityPostShuffle),
                             DAG, Subtarget))
    return false;

  LHS = DAG.getBitcast(VT, NewLHS);
  RHS = DAG.getBitcast(VT, NewRHS);
  return true;
}
53918
// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
                                       const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  unsigned Opcode = N->getOpcode();
  bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
  SmallVector<int, 8> PostShuffleMask;

  // True if N's single user is a shuffle already fed by a matching horizontal
  // op — in that case force the HOP so shuffle folding can merge them.
  auto MergableHorizOp = [N](unsigned HorizOpcode) {
    return N->hasOneUse() &&
           N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
           (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
            N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
  };

  switch (Opcode) {
  case ISD::FADD:
  case ISD::FSUB:
    // FP horizontal ops: SSE3 for 128-bit, AVX for 256-bit types.
    if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
      if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
                            PostShuffleMask, MergableHorizOp(HorizOpcode))) {
        SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
        // Apply any fixup shuffle required to reorder the HOP result.
        if (!PostShuffleMask.empty())
          HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
                                            DAG.getUNDEF(VT), PostShuffleMask);
        return HorizBinOp;
      }
    }
    break;
  case ISD::ADD:
  case ISD::SUB:
    // Integer horizontal ops: SSSE3, 128/256-bit i16/i32 types.
    if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
                                 VT == MVT::v16i16 || VT == MVT::v8i32)) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
      if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
                            PostShuffleMask, MergableHorizOp(HorizOpcode))) {
        // 256-bit integer HOPs may need splitting on targets without AVX2;
        // SplitOpsAndApply handles that uniformly.
        auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
                                        ArrayRef<SDValue> Ops) {
          return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
        };
        SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
                                              {LHS, RHS}, HOpBuilder);
        if (!PostShuffleMask.empty())
          HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
                                            DAG.getUNDEF(VT), PostShuffleMask);
        return HorizBinOp;
      }
    }
    break;
  }

  return SDValue();
}
53978
// Try to combine the following nodes
// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
//   <i32 -2147483648[float -0.000000e+00]> 0
// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
//   <(load 4 from constant-pool)> t0, t29
// [t30: v16i32 = bitcast t27]
// t6: v16i32 = xor t7, t27[t30]
// t11: v16f32 = bitcast t6
// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
// t22: v16f32 = bitcast t7
// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
// t24: v32f16 = bitcast t23
                                       const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  // A conjugation XOR flips which complex-multiply variant we need.
  int CombineOpcode =
      N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
  // Match bitcast(xor(x, sign-bit-pattern)) on LHS: XOR with 0x80000000 per
  // 32-bit element is a complex conjugation, so fold it into the opcode.
  auto combineConjugation = [&](SDValue &r) {
    if (LHS->getOpcode() == ISD::BITCAST) {
      SDValue XOR = LHS.getOperand(0);
      if (XOR->getOpcode() == ISD::XOR) {
        KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
        if (XORRHS.isConstant()) {
          APInt ConjugationInt32 = APInt(32, 0x80000000);
          APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
          if ((XORRHS.getBitWidth() == 32 &&
               XORRHS.getConstant() == ConjugationInt32) ||
              (XORRHS.getBitWidth() == 64 &&
               XORRHS.getConstant() == ConjugationInt64)) {
            // Preserve N's fast-math flags on the new node.
            SelectionDAG::FlagInserter FlagsInserter(DAG, N);
            SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
            SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
            r = DAG.getBitcast(VT, FCMulC);
            return true;
          }
        }
      }
    }
    return false;
  };
  // Try the pattern with the operands in either order.
  SDValue Res;
  if (combineConjugation(Res))
    return Res;
  std::swap(LHS, RHS);
  if (combineConjugation(Res))
    return Res;
  return Res;
}
54030
54031// Try to combine the following nodes:
54032// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
// NOTE(review): the signature line (inner line 54033) is missing from this
// rendering — presumably "static SDValue combineFaddCFmul(SDNode *N,
// SelectionDAG &DAG," — confirm against upstream.
54034 const X86Subtarget &Subtarget) {
// True if contraction (fusing mul+add) is allowed for this node.
// NOTE(review): the first line of the return expression (inner line 54036)
// is missing — presumably it also checks the global AllowFPOpFusion option.
54035 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54037 Flags.hasAllowContract();
54038 };
54039
// Signed-zero freedom from either the global option or the node flags.
54040 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54041 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54042 Flags.hasNoSignedZeros();
54043 };
// 0x80008000 is two packed f16 -0.0 values per 32-bit lane, i.e. a vector of
// all -0.0 half elements viewed at i32 granularity.
54044 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54045 APInt AI = APInt(32, 0x80008000);
54046 KnownBits Bits = DAG.computeKnownBits(Op);
54047 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54048 Bits.getConstant() == AI;
54049 };
54050
54051 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54052 !AllowContract(N->getFlags()))
54053 return SDValue();
54054
// Only the AVX512-FP16 half vector types participate.
54055 EVT VT = N->getValueType(0);
54056 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54057 return SDValue();
54058
54059 SDValue LHS = N->getOperand(0);
54060 SDValue RHS = N->getOperand(1);
54061 bool IsConj;
54062 SDValue FAddOp1, MulOp0, MulOp1;
// Recognizes bitcast(VFMULC/VFCMULC) or bitcast(VFMADDC/VFCMADDC with a
// (-0.0 or effectively-zero) accumulator); captures the multiplicands and
// whether the conjugating form was used.
54063 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54064 &IsVectorAllNegativeZero,
54065 &HasNoSignedZero](SDValue N) -> bool {
54066 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54067 return false;
54068 SDValue Op0 = N.getOperand(0);
54069 unsigned Opcode = Op0.getOpcode();
54070 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54071 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54072 MulOp0 = Op0.getOperand(0);
54073 MulOp1 = Op0.getOperand(1);
54074 IsConj = Opcode == X86ISD::VFCMULC;
54075 return true;
54076 }
// NOTE(review): the first line of this condition (inner line 54078) is
// missing — presumably it tests for an all-zeros accumulator combined with
// the no-signed-zeros check below.
54077 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54079 HasNoSignedZero(Op0->getFlags())) ||
54080 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54081 MulOp0 = Op0.getOperand(0);
54082 MulOp1 = Op0.getOperand(1);
54083 IsConj = Opcode == X86ISD::VFCMADDC;
54084 return true;
54085 }
54086 }
54087 return false;
54088 };
54089
// FADD is commutative — the complex multiply may be on either side; the
// other side becomes the accumulator.
54090 if (GetCFmulFrom(LHS))
54091 FAddOp1 = RHS;
54092 else if (GetCFmulFrom(RHS))
54093 FAddOp1 = LHS;
54094 else
54095 return SDValue();
54096
// Complex ops operate on the f32 view where each f32 lane holds one complex
// (f16 real, f16 imag) pair — hence half the element count.
54097 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54098 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54099 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54100 // FIXME: How do we handle when fast math flags of FADD are different from
54101 // CFMUL's?
54102 SDValue CFmul =
54103 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54104 return DAG.getBitcast(VT, CFmul);
54105}
54106
54107/// Do target-specific dag combines on floating-point adds/subs.
/// Tries, in order: horizontal add/sub formation, then folding an FADD of a
/// complex multiply into a complex FMA.
// NOTE(review): the signature line (inner line 54108) is missing from this
// rendering — presumably "static SDValue combineFaddFsub(SDNode *N,
// SelectionDAG &DAG," — confirm against upstream.
54109 const X86Subtarget &Subtarget) {
54110 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54111 return HOp;
54112
54113 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54114 return COp;
54115
54116 return SDValue();
54117}
54118
// Lowers narrow (sub-128-bit) FP-to-int rounding conversions to
// X86ISD::CVTP2SI by widening the source with undef elements.
// NOTE(review): the signature line (inner line 54119) is missing from this
// rendering — presumably "static SDValue combineLRINT_LLRINT(SDNode *N,
// SelectionDAG &DAG," — confirm against upstream.
54120 const X86Subtarget &Subtarget) {
54121 EVT VT = N->getValueType(0);
54122 SDValue Src = N->getOperand(0);
54123 EVT SrcVT = Src.getValueType();
54124 SDLoc DL(N);
54125
54126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54127
54128 // Let legalize expand this if it isn't a legal type yet.
54129 if (!TLI.isTypeLegal(VT))
54130 return SDValue();
54131
// f16 sources need AVX512-FP16; f32 sources need AVX512DQ for the packed
// cvt instruction used below.
54132 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54133 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54134 return SDValue();
54135
// Widen in two steps for f16: v2f16 -> v4f16, then (falling through the
// plain `if` below) v4f16 -> v8f16 to reach a full 128-bit vector.
54136 if (SrcVT == MVT::v2f16) {
54137 SrcVT = MVT::v4f16;
54138 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54139 DAG.getUNDEF(MVT::v2f16));
54140 }
54141
54142 if (SrcVT == MVT::v4f16) {
54143 SrcVT = MVT::v8f16;
54144 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54145 DAG.getUNDEF(MVT::v4f16));
54146 } else if (SrcVT == MVT::v2f32) {
54147 SrcVT = MVT::v4f32;
54148 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54149 DAG.getUNDEF(MVT::v2f32));
54150 } else {
// Sources already >= 128 bits (or other types) are not handled here.
54151 return SDValue();
54152 }
54153
54154 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54155}
54156
54157// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54158// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54159// are able to avoid generating code with MOVABS and large constants in certain
54160// cases.
// NOTE(review): the signature line (inner line 54161) is missing from this
// rendering — presumably "static SDValue combinei64TruncSrlConstant(SDValue N,
// EVT VT, SelectionDAG &DAG," — confirm against upstream.
54162 const SDLoc &DL) {
54163 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54164 std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N);
54165 if (!ValidSrlConst)
54166 return SDValue();
54167 uint64_t SrlConstVal = *ValidSrlConst;
54168
54169 SDValue Op = N.getOperand(0);
54170 unsigned Opcode = Op.getOpcode();
// Caller (combineTruncatedArithmetic) guarantees an i64 -> i32 truncate.
54171 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54172 "Illegal truncation types");
54173
54174 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54175 !isa<ConstantSDNode>(Op.getOperand(1)))
54176 return SDValue();
54177 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54178
// Only shifts > 32 are handled (the retained field is narrower than i32 —
// presumably other shifts are better served by existing folds; confirm).
// For ADD, the constant's trailing zeros must cover the shift amount so no
// carries from the discarded low bits can flow into the retained bits.
54179 if (SrlConstVal <= 32 ||
54180 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54181 return SDValue();
54182
// Rebuild as (op (trunc (srl X, C2)), C1 >> C2) with a 32-bit constant.
54183 SDValue OpLhsSrl =
54184 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54185 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54186
54187 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54188 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54189 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54190
// After an srl by C2, only the low 64-C2 bits are meaningful; the i32 add
// may produce garbage above that, so mask it back off. OR/XOR distribute
// over truncation and need no cleanup.
54191 if (Opcode == ISD::ADD) {
54192 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54193 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54194 }
54195 return NewOpNode;
54196}
54197
54198/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54199/// the codegen.
54200/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54201/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54202/// anything that is guaranteed to be transformed by DAGCombiner.
// NOTE(review): the signature line (inner line 54203) is missing from this
// rendering — presumably "static SDValue combineTruncatedArithmetic(SDNode *N,
// SelectionDAG &DAG," — confirm against upstream.
54204 const X86Subtarget &Subtarget,
54205 const SDLoc &DL) {
54206 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54207 SDValue Src = N->getOperand(0);
54208 unsigned SrcOpcode = Src.getOpcode();
54209 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54210
54211 EVT VT = N->getValueType(0);
54212 EVT SrcVT = Src.getValueType();
54213
// An operand truncates for free if it is either an extend from a type no
// wider than the truncated type, or a constant build_vector we can fold.
54214 auto IsFreeTruncation = [VT](SDValue Op) {
54215 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54216
54217 // See if this has been extended from a smaller/equal size to
54218 // the truncation size, allowing a truncation to combine with the extend.
54219 unsigned Opcode = Op.getOpcode();
54220 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54221 Opcode == ISD::ZERO_EXTEND) &&
54222 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54223 return true;
54224
54225 // See if this is a single use constant which can be constant folded.
54226 // NOTE: We don't peek through bitcasts here because there is currently
54227 // no support for constant folding truncate+bitcast+vector_of_constants. So
54228 // we'll just end up with a truncate on both operands which will
54229 // get turned back into (truncate (binop)) causing an infinite loop.
54230 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54231 };
54232
// Build BINOP(TRUNC(N0), TRUNC(N1)) at the narrow type.
54233 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54234 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54235 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54236 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54237 };
54238
54239 // Don't combine if the operation has other uses.
54240 if (!Src.hasOneUse())
54241 return SDValue();
54242
// Scalar special case: i64 -> i32 truncation of a constant srl.
54243 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54244 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54245
54246 if (!VT.isVector())
54247 return SDValue();
54248
54249 // In most cases its only worth pre-truncating if we're only facing the cost
54250 // of one truncation.
54251 // i.e. if one of the inputs will constant fold or the input is repeated.
54252 switch (SrcOpcode) {
54253 case ISD::MUL:
54254 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
54255 // better to truncate if we have the chance.
54256 if (SrcVT.getScalarType() == MVT::i64 &&
54257 TLI.isOperationLegal(SrcOpcode, VT) &&
54258 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54259 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54260 [[fallthrough]];
54261 case ISD::AND:
54262 case ISD::XOR:
54263 case ISD::OR:
54264 case ISD::ADD:
54265 case ISD::SUB: {
54266 SDValue Op0 = Src.getOperand(0);
54267 SDValue Op1 = Src.getOperand(1);
54268 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54269 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54270 return TruncateArithmetic(Op0, Op1);
54271 break;
54272 }
54273 }
54274
54275 return SDValue();
54276}
54277
54278// Try to form a MULHU or MULHS node by looking for
54279// (trunc (srl (mul ext, ext), >= 16))
54280// TODO: This is X86 specific because we want to be able to handle wide types
54281// before type legalization. But we can only do it if the vector will be
54282// legalized via widening/splitting. Type legalization can't handle promotion
54283// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54284// combiner.
54285static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54286 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54287 using namespace llvm::SDPatternMatch;
54288
54289 if (!Subtarget.hasSSE2())
54290 return SDValue();
54291
54292 // Only handle vXi16 types that are at least 128-bits unless they will be
54293 // widened.
54294 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54295 return SDValue();
54296
54297 // Input type should be at least vXi32.
54298 EVT InVT = Src.getValueType();
54299 if (InVT.getVectorElementType().getSizeInBits() < 32)
54300 return SDValue();
54301
54302 // First instruction should be a right shift by 16 of a multiply.
54303 SDValue LHS, RHS;
54304 APInt ShiftAmt;
54305 if (!sd_match(Src,
54306 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54307 return SDValue();
54308
// Shift must take at least the low 16 bits away and be in range.
54309 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54310 return SDValue();
54311
// Shifts greater than 16 are handled as PMULH plus a follow-up srl by the
// remainder (applied at the end).
54312 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54313
54314 // Count leading sign/zero bits on both inputs - if there are enough then
54315 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54316 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54317 // truncations may actually be free by peeking through to the ext source.
54318 auto IsSext = [&DAG](SDValue V) {
54319 return DAG.ComputeMaxSignificantBits(V) <= 16;
54320 };
54321 auto IsZext = [&DAG](SDValue V) {
54322 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54323 };
54324
54325 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54326 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54327 if (!IsSigned && !IsUnsigned)
54328 return SDValue();
54329
54330 // Check if both inputs are extensions, which will be removed by truncation.
54331 auto isOpTruncateFree = [](SDValue Op) {
54332 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54333 Op.getOpcode() == ISD::ZERO_EXTEND)
54334 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54335 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54336 };
54337 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54338
54339 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54340 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54341 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54342 // will have to split anyway.
54343 unsigned InSizeInBits = InVT.getSizeInBits();
54344 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54345 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54346 (InSizeInBits % 16) == 0) {
// View the wide inputs as vXi16; only the even (low-half) lanes carry data,
// the odd lanes are known zero, so MULHU produces the correct high halves.
54347 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54348 InVT.getSizeInBits() / 16);
54349 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54350 DAG.getBitcast(BCVT, RHS));
54351 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54352 return DAG.getNode(ISD::SRL, DL, VT, Res,
54353 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54354 }
54355
54356 // Truncate back to source type.
54357 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54358 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54359
54360 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54361 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54362 return DAG.getNode(ISD::SRL, DL, VT, Res,
54363 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54364}
54365
54366// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54367// from one vector with signed bytes from another vector, adds together
54368// adjacent pairs of 16-bit products, and saturates the result before
54369// truncating to 16-bits.
54370//
54371// Which looks something like this:
54372// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54373// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
// NOTE(review): the signature line (inner line 54374) is missing from this
// rendering — presumably "static SDValue detectPMADDUBSW(SDValue In, EVT VT,
// SelectionDAG &DAG," — confirm against upstream.
54375 const X86Subtarget &Subtarget,
54376 const SDLoc &DL) {
54377 if (!VT.isVector() || !Subtarget.hasSSSE3())
54378 return SDValue();
54379
// Result must be vXi16 with a power-of-2 count of at least 8 lanes.
54380 unsigned NumElems = VT.getVectorNumElements();
54381 EVT ScalarVT = VT.getVectorElementType();
54382 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54383 return SDValue();
54384
54385 SDValue SSatVal = detectSSatPattern(In, VT);
54386 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54387 return SDValue();
54388
54389 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54390 // of multiplies from even/odd elements.
54391 SDValue N0 = SSatVal.getOperand(0);
54392 SDValue N1 = SSatVal.getOperand(1);
54393
54394 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54395 return SDValue();
54396
54397 SDValue N00 = N0.getOperand(0);
54398 SDValue N01 = N0.getOperand(1);
54399 SDValue N10 = N1.getOperand(0);
54400 SDValue N11 = N1.getOperand(1);
54401
54402 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54403 // Canonicalize zero_extend to LHS.
54404 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54405 std::swap(N00, N01);
54406 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54407 std::swap(N10, N11);
54408
54409 // Ensure we have a zero_extend and a sign_extend.
54410 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54411 N01.getOpcode() != ISD::SIGN_EXTEND ||
54412 N10.getOpcode() != ISD::ZERO_EXTEND ||
54413 N11.getOpcode() != ISD::SIGN_EXTEND)
54414 return SDValue();
54415
54416 // Peek through the extends.
54417 N00 = N00.getOperand(0);
54418 N01 = N01.getOperand(0);
54419 N10 = N10.getOperand(0);
54420 N11 = N11.getOperand(0);
54421
54422 // Ensure the extend is from vXi8.
54423 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54424 N01.getValueType().getVectorElementType() != MVT::i8 ||
54425 N10.getValueType().getVectorElementType() != MVT::i8 ||
54426 N11.getValueType().getVectorElementType() != MVT::i8)
54427 return SDValue();
54428
54429 // All inputs should be build_vectors.
// NOTE(review): the last line of this condition (inner line 54433) is
// missing — presumably "N11.getOpcode() != ISD::BUILD_VECTOR)".
54430 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54431 N01.getOpcode() != ISD::BUILD_VECTOR ||
54432 N10.getOpcode() != ISD::BUILD_VECTOR ||
54434 return SDValue();
54435
54436 // N00/N10 are zero extended. N01/N11 are sign extended.
54437
54438 // For each element, we need to ensure we have an odd element from one vector
54439 // multiplied by the odd element of another vector and the even element from
54440 // one of the same vectors being multiplied by the even element from the
54441 // other vector. So we need to make sure for each element i, this operator
54442 // is being performed:
54443 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54444 SDValue ZExtIn, SExtIn;
54445 for (unsigned i = 0; i != NumElems; ++i) {
54446 SDValue N00Elt = N00.getOperand(i);
54447 SDValue N01Elt = N01.getOperand(i);
54448 SDValue N10Elt = N10.getOperand(i);
54449 SDValue N11Elt = N11.getOperand(i);
54450 // TODO: Be more tolerant to undefs.
// NOTE(review): the last line of this condition (inner line 54454) is
// missing — presumably "N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)".
54451 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54452 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54453 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54455 return SDValue();
// All extract indices must be constants so the even/odd pairing below can
// be checked statically.
54456 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54457 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54458 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54459 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54460 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54461 return SDValue();
54462 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54463 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54464 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54465 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54466 // Add is commutative so indices can be reordered.
54467 if (IdxN00 > IdxN10) {
54468 std::swap(IdxN00, IdxN10);
54469 std::swap(IdxN01, IdxN11);
54470 }
54471 // N0 indices be the even element. N1 indices must be the next odd element.
54472 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54473 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54474 return SDValue();
54475 SDValue N00In = N00Elt.getOperand(0);
54476 SDValue N01In = N01Elt.getOperand(0);
54477 SDValue N10In = N10Elt.getOperand(0);
54478 SDValue N11In = N11Elt.getOperand(0);
54479 // First time we find an input capture it.
54480 if (!ZExtIn) {
54481 ZExtIn = N00In;
54482 SExtIn = N01In;
54483 }
// Every element must come from the same pair of source vectors.
54484 if (ZExtIn != N00In || SExtIn != N01In ||
54485 ZExtIn != N10In || SExtIn != N11In)
54486 return SDValue();
54487 }
54488
// The sources may be wider than needed — take the low 2*NumElems i8 lanes.
54489 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54490 EVT ExtVT = Ext.getValueType();
54491 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54492 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54493 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54494 DAG.getVectorIdxConstant(0, DL));
54495 }
54496 };
54497 ExtractVec(ZExtIn);
54498 ExtractVec(SExtIn);
54499
54500 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54501 ArrayRef<SDValue> Ops) {
54502 // Shrink by adding truncate nodes and let DAGCombine fold with the
54503 // sources.
54504 EVT InVT = Ops[0].getValueType();
54505 assert(InVT.getScalarType() == MVT::i8 &&
54506 "Unexpected scalar element type");
54507 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54508 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54509 InVT.getVectorNumElements() / 2);
54510 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54511 };
// Split into legal-width pieces (e.g. 128-bit on SSSE3-only targets) and
// emit one VPMADDUBSW per piece.
54512 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54513 PMADDBuilder);
54514}
54515
// Top-level ISD::TRUNCATE combine: tries pre-truncated arithmetic, PMADDUBSW,
// saturating truncation, PMULH formation, then an MMX movd special case.
// NOTE(review): the signature line (inner line 54516) is missing from this
// rendering — presumably "static SDValue combineTruncate(SDNode *N,
// SelectionDAG &DAG," — confirm against upstream.
54517 const X86Subtarget &Subtarget) {
54518 EVT VT = N->getValueType(0);
54519 SDValue Src = N->getOperand(0);
54520 SDLoc DL(N);
54521
54522 // Attempt to pre-truncate inputs to arithmetic ops instead.
54523 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54524 return V;
54525
54526 // Try to detect PMADD
54527 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54528 return PMAdd;
54529
54530 // Try to combine truncation with signed/unsigned saturation.
54531 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54532 return Val;
54533
54534 // Try to combine PMULHUW/PMULHW for vXi16.
54535 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54536 return V;
54537
54538 // The bitcast source is a direct mmx result.
54539 // Detect bitcasts between i32 to x86mmx
54540 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54541 SDValue BCSrc = Src.getOperand(0);
54542 if (BCSrc.getValueType() == MVT::x86mmx)
54543 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54544 }
54545
54546 return SDValue();
54547}
54548
// Combine for a vector truncation node (presumably X86ISD::VTRUNC): fold in
// signed/unsigned saturation, otherwise simplify via demanded bits.
// NOTE(review): the two signature lines (inner lines 54549-54550) are missing
// from this rendering — presumably "static SDValue combineVTRUNC(SDNode *N,
// SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) {" — confirm.
54551 EVT VT = N->getValueType(0);
54552 SDValue In = N->getOperand(0);
54553 SDLoc DL(N);
54554
// Saturating patterns map directly onto the saturating truncate opcodes.
54555 if (SDValue SSatVal = detectSSatPattern(In, VT))
54556 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54557 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54558 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54559
// Otherwise, only the low result-width bits of the input matter.
54560 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54561 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54562 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54563 return SDValue(N, 0);
54564
54565 return SDValue();
54566}
54567
54568/// Returns the negated value if the node \p N flips sign of FP value.
54569///
54570/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54571/// or FSUB(0, x)
54572/// AVX512F does not have FXOR, so FNEG is lowered as
54573/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54574/// In this case we go though all bitcasts.
54575/// This also recognizes splat of a negated value and returns the splat of that
54576/// value.
54577static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54578 if (N->getOpcode() == ISD::FNEG)
54579 return N->getOperand(0);
54580
54581 // Don't recurse exponentially.
// NOTE(review): the condition line (inner line 54582) is missing from this
// rendering — presumably "if (Depth > SelectionDAG::MaxRecursionDepth)".
54583 return SDValue();
54584
54585 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54586
// NOTE(review): inner line 54587 is missing — presumably
// "SDValue Op = peekThroughBitcasts(SDValue(N, 0));" given the Op uses below.
54588 EVT VT = Op->getValueType(0);
54589
54590 // Make sure the element size doesn't change.
54591 if (VT.getScalarSizeInBits() != ScalarSize)
54592 return SDValue();
54593
54594 unsigned Opc = Op.getOpcode();
54595 switch (Opc) {
54596 case ISD::VECTOR_SHUFFLE: {
54597 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54598 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54599 if (!Op.getOperand(1).isUndef())
54600 return SDValue();
54601 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54602 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54603 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54604 cast<ShuffleVectorSDNode>(Op)->getMask());
54605 break;
54606 }
// NOTE(review): inner line 54607 is missing — presumably
// "case ISD::INSERT_VECTOR_ELT: {" given the comment and operands below.
54608 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54609 // -V, INDEX).
54610 SDValue InsVector = Op.getOperand(0);
54611 SDValue InsVal = Op.getOperand(1);
54612 if (!InsVector.isUndef())
54613 return SDValue();
54614 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54615 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54616 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54617 NegInsVal, Op.getOperand(2));
54618 break;
54619 }
54620 case ISD::FSUB:
54621 case ISD::XOR:
54622 case X86ISD::FXOR: {
54623 SDValue Op1 = Op.getOperand(1);
54624 SDValue Op0 = Op.getOperand(0);
54625
54626 // For XOR and FXOR, we want to check if constant
54627 // bits of Op1 are sign bit masks. For FSUB, we
54628 // have to check if constant bits of Op0 are sign
54629 // bit masks and hence we swap the operands.
54630 if (Opc == ISD::FSUB)
54631 std::swap(Op0, Op1);
54632
54633 APInt UndefElts;
54634 SmallVector<APInt, 16> EltBits;
54635 // Extract constant bits and see if they are all
54636 // sign bit masks. Ignore the undef elements.
54637 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54638 /* AllowWholeUndefs */ true,
54639 /* AllowPartialUndefs */ false)) {
54640 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54641 if (!UndefElts[I] && !EltBits[I].isSignMask())
54642 return SDValue();
54643
54644 // Only allow bitcast from correctly-sized constant.
54645 Op0 = peekThroughBitcasts(Op0);
54646 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54647 return Op0;
54648 }
54649 break;
54650 } // case
54651 } // switch
54652
54653 return SDValue();
54654}
54655
// Returns the FMA-family opcode equivalent to \p Opcode after negating the
// multiply result (NegMul), the accumulator (NegAcc), and/or the whole
// result (NegRes). Each flag toggles the opcode along one axis; the switches
// are exhaustive — an opcode outside the supported set is a programmer error
// (llvm_unreachable).
54656static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54657 bool NegRes) {
// Negating the product swaps FMADD<->FNMADD and FMSUB<->FNMSUB (plus their
// strict/rounding variants).
54658 if (NegMul) {
54659 switch (Opcode) {
54660 // clang-format off
54661 default: llvm_unreachable("Unexpected opcode");
54662 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54663 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54664 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54665 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54666 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54667 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54668 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54669 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54670 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54671 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54672 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54673 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54674 // clang-format on
54675 }
54676 }
54677
// Negating the accumulator swaps ADD<->SUB forms; this axis also covers the
// FMADDSUB/FMSUBADD alternating variants.
54678 if (NegAcc) {
54679 switch (Opcode) {
54680 // clang-format off
54681 default: llvm_unreachable("Unexpected opcode");
54682 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54683 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54684 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54685 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54686 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54687 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54688 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54689 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54690 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54691 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54692 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54693 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54694 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54695 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54696 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54697 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54698 // clang-format on
54699 }
54700 }
54701
// Negating the whole result: -(a*b+c) == (-a)*b-c, hence FMA<->FNMSUB etc.
// Note: no STRICT_* cases here by design (see comment below).
54702 if (NegRes) {
54703 switch (Opcode) {
54704 // For accuracy reason, we never combine fneg and fma under strict FP.
54705 // clang-format off
54706 default: llvm_unreachable("Unexpected opcode");
54707 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54708 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54709 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54710 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54711 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54712 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54713 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54714 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54715 // clang-format on
54716 }
54717 }
54718
54719 return Opcode;
54720}
54721
54722/// Do target-specific dag combines on floating point negations.
// NOTE(review): the two signature lines (inner lines 54723-54724) are missing
// from this rendering — presumably "static SDValue combineFneg(SDNode *N,
// SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI," — confirm.
54725 const X86Subtarget &Subtarget) {
54726 EVT OrigVT = N->getValueType(0);
// Recognize the many forms of FP negation (FNEG/FXOR/FSUB-from-zero, through
// bitcasts); Arg is the value being negated.
54727 SDValue Arg = isFNEG(DAG, N);
54728 if (!Arg)
54729 return SDValue();
54730
54731 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54732 EVT VT = Arg.getValueType();
54733 EVT SVT = VT.getScalarType();
54734 SDLoc DL(N);
54735
54736 // Let legalize expand this if it isn't a legal type yet.
54737 if (!TLI.isTypeLegal(VT))
54738 return SDValue();
54739
54740 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54741 // use of a constant by performing (-0 - A*B) instead.
54742 // FIXME: Check rounding control flags as well once it becomes available.
54743 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54744 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54745 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54746 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54747 Arg.getOperand(1), Zero);
54748 return DAG.getBitcast(OrigVT, NewNode);
54749 }
54750
// NOTE(review): inner line 54751 is missing — presumably
// "bool CodeSize = DAG.shouldOptForSize();" given the CodeSize use below.
54752 bool LegalOperations = !DCI.isBeforeLegalizeOps();
// Fall back to the generic negated-expression machinery (which dispatches
// back into X86TargetLowering::getNegatedExpression).
54753 if (SDValue NegArg =
54754 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54755 return DAG.getBitcast(OrigVT, NegArg);
54756
54757 return SDValue();
54758}
54759
// X86 override of TargetLowering::getNegatedExpression: returns a cheaper
// negated form of Op when one exists (free fneg patterns, FMA-family opcode
// flips, FRCP of a negatable operand), else defers to the base class.
// NOTE(review): the first signature line (inner line 54760) is missing from
// this rendering — presumably "SDValue X86TargetLowering::getNegatedExpression(
// SDValue Op, SelectionDAG &DAG," — confirm against upstream.
54761 bool LegalOperations,
54762 bool ForCodeSize,
// NOTE(review): inner line 54763 is missing — presumably the
// "NegatibleCost &Cost," out-parameter, given the Cost assignments below.
54764 unsigned Depth) const {
54765 // fneg patterns are removable even if they have multiple uses.
54766 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
// NOTE(review): inner line 54767 is missing — presumably
// "Cost = NegatibleCost::Cheaper;".
54768 return DAG.getBitcast(Op.getValueType(), Arg);
54769 }
54770
54771 EVT VT = Op.getValueType();
54772 EVT SVT = VT.getScalarType();
54773 unsigned Opc = Op.getOpcode();
54774 SDNodeFlags Flags = Op.getNode()->getFlags();
54775 switch (Opc) {
54776 case ISD::FMA:
54777 case X86ISD::FMSUB:
54778 case X86ISD::FNMADD:
54779 case X86ISD::FNMSUB:
54780 case X86ISD::FMADD_RND:
54781 case X86ISD::FMSUB_RND:
54782 case X86ISD::FNMADD_RND:
54783 case X86ISD::FNMSUB_RND: {
// NOTE(review): the last line of this condition (inner line 54786) is
// missing — presumably a legality check such as
// "!isOperationLegal(ISD::FMA, VT))".
54784 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54785 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54787 break;
54788
54789 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54790 // if it may have signed zeros.
54791 if (!Flags.hasNoSignedZeros())
54792 break;
54793
54794 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54795 // keep temporary nodes alive.
54796 std::list<HandleSDNode> Handles;
54797
54798 // This is always negatible for free but we might be able to remove some
54799 // extra operand negations as well.
// NOTE(review): inner line 54800 is missing — presumably the declaration of
// NewOps, e.g. "SmallVector<SDValue, 4> NewOps(Op.getNumOperands(),
// SDValue());".
54801 for (int i = 0; i != 3; ++i) {
54802 NewOps[i] = getCheaperNegatedExpression(
54803 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54804 if (!!NewOps[i])
54805 Handles.emplace_back(NewOps[i]);
54806 }
54807
54808 bool NegA = !!NewOps[0];
54809 bool NegB = !!NewOps[1];
54810 bool NegC = !!NewOps[2];
// Negating both multiplicands cancels out; NegRes=true because we are
// producing the negation of Op itself.
54811 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54812
// NOTE(review): inner line 54814 is missing — presumably the ternary's else
// branch ": NegatibleCost::Neutral;".
54813 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54815
54816 // Fill in the non-negated ops with the original values.
54817 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54818 if (!NewOps[i])
54819 NewOps[i] = Op.getOperand(i);
54820 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54821 }
// -(1/x) == 1/(-x): push the negation through the reciprocal's operand.
54822 case X86ISD::FRCP:
54823 if (SDValue NegOp0 =
54824 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54825 ForCodeSize, Cost, Depth + 1))
54826 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54827 break;
54828 }
54829
54830 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54831 ForCodeSize, Cost, Depth);
54832}
54833
// Lowers an X86 FP logic op (FOR/FXOR/FAND/FANDN) to the equivalent integer
// vector op via bitcasts, when integer vector ops are available (SSE2+).
// NOTE(review): the signature line (inner line 54834) is missing from this
// rendering — presumably "static SDValue lowerX86FPLogicOp(SDNode *N,
// SelectionDAG &DAG," — confirm against upstream.
54835 const X86Subtarget &Subtarget) {
54836 MVT VT = N->getSimpleValueType(0);
54837 // If we have integer vector types available, use the integer opcodes.
54838 if (!VT.isVector() || !Subtarget.hasSSE2())
54839 return SDValue();
54840
54841 SDLoc dl(N);
// NOTE(review): inner line 54842 is missing — presumably the definition of
// IntVT, the integer vector type with the same total/element width as VT.
54843 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54844 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54845 unsigned IntOpcode;
54846 switch (N->getOpcode()) {
54847 // clang-format off
54848 default: llvm_unreachable("Unexpected FP logic op");
54849 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54850 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54851 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54852 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54853 // clang-format on
54854 }
54855 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54856 return DAG.getBitcast(VT, IntOp);
54857}
54858
54859/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
// Only fires when the XOR's RHS is the constant 1 and its LHS is an
// X86ISD::SETCC node; the xor-by-1 then just flips the boolean, which is
// equivalent to inverting the condition code of the SETCC.
// NOTE(review): the signature line and the line computing NewCC (presumably
// via X86::GetOppositeBranchCondition) are not visible in this view.
54861  if (N->getOpcode() != ISD::XOR)
54862    return SDValue();
54863
54864  SDValue LHS = N->getOperand(0);
54865  if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54866    return SDValue();
54867
  // Operand 0 of X86ISD::SETCC is the condition-code constant.
54869      X86::CondCode(LHS->getConstantOperandVal(0)));
54870  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54871}
54872
// Fold xor/sub patterns of CTLZ_ZERO_UNDEF into a direct X86ISD::BSR:
// (bitwidth-1) ^ ctlz_zero_undef(x)  ==  bsr(x), since BSR reports the index
// of the highest set bit (bitwidth-1 - ctlz). Skipped when LZCNT is fast,
// because then the plain CTLZ lowering is preferable.
54874                                    const X86Subtarget &Subtarget) {
54875  assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54876         "Invalid opcode for combing with CTLZ");
54877  if (Subtarget.hasFastLZCNT())
54878    return SDValue();
54879
  // BSR only exists for i16/i32/i64 (i8 is widened below); i64 needs 64-bit.
54880  EVT VT = N->getValueType(0);
54881  if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54882      (VT != MVT::i64 || !Subtarget.is64Bit()))
54883    return SDValue();
54884
54885  SDValue N0 = N->getOperand(0);
54886  SDValue N1 = N->getOperand(1);
54887
  // NOTE(review): the second half of this condition is on a line not visible
  // in this view — presumably it also checks N1 for CTLZ_ZERO_UNDEF.
54888  if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54890    return SDValue();
54891
54892  SDValue OpCTLZ;
54893  SDValue OpSizeTM1;
54894
  // Identify which operand is the CTLZ and which is the (bitwidth-1)
  // constant. SUB is not commutative, so ctlz must be the second operand.
54895  if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54896    OpCTLZ = N1;
54897    OpSizeTM1 = N0;
54898  } else if (N->getOpcode() == ISD::SUB) {
54899    return SDValue();
54900  } else {
54901    OpCTLZ = N0;
54902    OpSizeTM1 = N1;
54903  }
54904
54905  if (!OpCTLZ.hasOneUse())
54906    return SDValue();
54907  auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54908  if (!C)
54909    return SDValue();
54910
  // The constant must be exactly bitwidth-1 for the identity to hold.
54911  if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54912    return SDValue();
54913  EVT OpVT = VT;
54914  SDValue Op = OpCTLZ.getOperand(0);
54915  if (VT == MVT::i8) {
54916    // Zero extend to i32 since there is not an i8 bsr.
54917    OpVT = MVT::i32;
54918    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54919  }
54920
  // BSR also produces EFLAGS (the MVT::i32 result); the passthrough operand
  // is undef since ctlz_zero_undef makes the zero-input case undefined.
54921  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54922  Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
54923  if (VT == MVT::i8)
54924    Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
54925
54926  return Op;
54927}
54928
// DAG combine for ISD::XOR. Tries, in order: SSE1-only FXOR conversion,
// xor-of-shift -> compare folds, bit-op folds through MOVMSK/shift/PACK,
// int->FP logic conversion, the CTLZ/BSR fold, SETCC inversion, NOT folds
// over i1 vectors and AVX512 mask insert_subvector, constant re-association
// through zext/trunc, BMI logic ops, and finally FNEG combining.
54931                          const X86Subtarget &Subtarget) {
54932  SDValue N0 = N->getOperand(0);
54933  SDValue N1 = N->getOperand(1);
54934  EVT VT = N->getValueType(0);
54935  SDLoc DL(N);
54936
54937  // If this is SSE1 only convert to FXOR to avoid scalarization.
54938  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
54939    return DAG.getBitcast(MVT::v4i32,
54940                          DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
54941                                      DAG.getBitcast(MVT::v4f32, N0),
54942                                      DAG.getBitcast(MVT::v4f32, N1)));
54943  }
54944
54945  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
54946    return Cmp;
54947
54948  if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54949    return R;
54950
54951  if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54952    return R;
54953
54954  if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54955    return R;
54956
54957  if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54958                                                 DAG, DCI, Subtarget))
54959    return FPLogic;
54960
54961  if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
54962    return R;
54963
  // The folds below assume legalized operations.
54964  if (DCI.isBeforeLegalizeOps())
54965    return SDValue();
54966
54967  if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
54968    return SetCC;
54969
54970  if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
54971    return R;
54972
54973  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
54974    return RV;
54975
54976  // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
54977  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54978  if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
54979      N0.getOperand(0).getValueType().isVector() &&
54980      N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54981      TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
54982    return DAG.getBitcast(
54983        VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
54984  }
54985
54986  // Handle AVX512 mask widening.
54987  // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
  // NOTE(review): parts of this condition/getNode call are on lines not
  // visible in this view (presumably the INSERT_SUBVECTOR opcode check and
  // the opcode/type arguments to getNode).
54988  if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
54989      VT.getVectorElementType() == MVT::i1 &&
54991      TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
54992    return DAG.getNode(
54994        DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
54995        N0.getOperand(2));
54996  }
54997
54998  // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
54999  // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55000  // TODO: Under what circumstances could this be performed in DAGCombine?
55001  if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55002      N0.getOperand(0).getOpcode() == N->getOpcode()) {
55003    SDValue TruncExtSrc = N0.getOperand(0);
55004    auto *N1C = dyn_cast<ConstantSDNode>(N1);
55005    auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55006    if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55007      SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55008      SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55009      return DAG.getNode(ISD::XOR, DL, VT, LHS,
55010                         DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55011    }
55012  }
55013
55014  if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55015    return R;
55016
  // xor with the sign-bit mask is an fneg in disguise; let combineFneg try.
55017  return combineFneg(N, DAG, DCI, Subtarget);
55018}
55019
// DAG combine for ISD::BITREVERSE: turn a scalar bitreverse of a bitcast
// vXi1 mask into a vector shuffle that reverses the mask elements, then
// bitcast back — reversing i1 lanes is exactly a bit reversal of the scalar.
55022                                      const X86Subtarget &Subtarget) {
55023  SDValue N0 = N->getOperand(0);
55024  EVT VT = N->getValueType(0);
55025
55026  // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55027  if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55028    SDValue Src = N0.getOperand(0);
55029    EVT SrcVT = Src.getValueType();
    // NOTE(review): the SSSE3 check presumably guards the shuffle lowering
    // quality (PSHUFB) — confirm against the original file.
55030    if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55031        (DCI.isBeforeLegalize() ||
55032         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55033        Subtarget.hasSSSE3()) {
      // Build the element-reversal mask [N-1, N-2, ..., 0].
55034      unsigned NumElts = SrcVT.getVectorNumElements();
55035      SmallVector<int, 32> ReverseMask(NumElts);
55036      for (unsigned I = 0; I != NumElts; ++I)
55037        ReverseMask[I] = (NumElts - 1) - I;
55038      SDValue Rev =
55039          DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55040      return DAG.getBitcast(VT, Rev);
55041    }
55042  }
55043
55044  return SDValue();
55045}
55046
55047// Various combines to try to convert to avgceilu.
// Signed ceil-average on vXi8 is rewritten as an unsigned ceil-average with
// both inputs (and the result) sign-bit-flipped, since X86 only has PAVGB
// (unsigned) and vXi8 lacks a cheap arithmetic shift right.
55050                           const X86Subtarget &Subtarget) {
55051  unsigned Opcode = N->getOpcode();
55052  SDValue N0 = N->getOperand(0);
55053  SDValue N1 = N->getOperand(1);
55054  EVT VT = N->getValueType(0);
55055  EVT SVT = VT.getScalarType();
55056  SDLoc DL(N);
55057
55058  // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55059  // Only useful on vXi8 which doesn't have good SRA handling.
  // NOTE(review): the line defining SignBit is not visible in this view —
  // presumably the i8 sign-bit mask (0x80 per lane).
55060  if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55062    SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
    // XOR with the sign mask maps signed values to biased-unsigned space.
55063    N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55064    N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55065    return DAG.getNode(ISD::XOR, DL, VT,
55066                       DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55067  }
55068
55069  return SDValue();
55070}
55071
// Generic demanded-bits simplification for this node's operands: ask
// SimplifyDemandedBits with a full mask so known-bits reasoning can fold
// or narrow the inputs. Returns SDValue(N, 0) if anything changed.
// NOTE(review): the function name/signature is on lines not visible in this
// view (by position in the file this is likely the X86ISD::TESTP combine).
55074                               const X86Subtarget &Subtarget) {
55075  EVT VT = N->getValueType(0);
55076  unsigned NumBits = VT.getSizeInBits();
55077
55078  // TODO - Constant Folding.
55079
55080  // Simplify the inputs.
55081  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55082  APInt DemandedMask(APInt::getAllOnes(NumBits));
55083  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55084    return SDValue(N, 0);
55085
55086  return SDValue();
55087}
55088
  // True if V is a scalar FP zero constant or an all-zeros build_vector
  // (undef elements allowed by isBuildVectorAllZeros).
  // NOTE(review): the signature line is not visible in this view.
55090  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55091}
55092
55093/// If a value is a scalar FP zero or a vector FP zero (potentially including
55094/// undefined elements), return a zero constant that may be used to fold away
55095/// that value. In the case of a vector, the returned constant will not contain
55096/// undefined elements even if the input parameter does. This makes it suitable
55097/// to be used as a replacement operand with operations (eg, bitwise-and) where
55098/// an undef should not propagate.
// NOTE(review): the signature line and the guard calling
// isNullFPScalarOrVectorConst are on lines not visible in this view.
55100                                        const X86Subtarget &Subtarget) {
55102    return SDValue();
55103
  // Vectors: rebuild a fully-defined zero vector (no undef lanes).
55104  if (V.getValueType().isVector())
55105    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55106
  // Scalars: the existing FP zero constant is already fully defined.
55107  return V;
55108}
55109
// Fold FAND with a NOT-ed operand (fxor x, -1) into X86ISD::FANDN, for the
// scalar/vector FP types where FANDN exists. Vector i-domain versions are
// handled elsewhere (combineANDXORWithAllOnesIntoANDNP).
55111                                        const X86Subtarget &Subtarget) {
55112  SDValue N0 = N->getOperand(0);
55113  SDValue N1 = N->getOperand(1);
55114  EVT VT = N->getValueType(0);
55115  SDLoc DL(N);
55116
55117  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55118  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55119        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55120        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55121    return SDValue();
55122
  // Recognize an all-ones-bits FP constant (scalar or build_vector form).
55123  auto isAllOnesConstantFP = [](SDValue V) {
55124    if (V.getSimpleValueType().isVector())
55125      return ISD::isBuildVectorAllOnes(V.getNode());
55126    auto *C = dyn_cast<ConstantFPSDNode>(V);
55127    return C && C->getConstantFPValue()->isAllOnesValue();
55128  };
55129
55130  // fand (fxor X, -1), Y --> fandn X, Y
55131  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55132    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55133
55134  // fand X, (fxor Y, -1) --> fandn Y, X
55135  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55136    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55137
55138  return SDValue();
55139}
55140
55141/// Do target-specific dag combines on X86ISD::FAND nodes.
// Folds: and-with-zero on either side produces a clean zero constant
// (undef-free, via getNullFPConstForNullVal); then tries the FANDN fold and
// finally the generic integer-domain lowering.
55143                           const X86Subtarget &Subtarget) {
55144  // FAND(0.0, x) -> 0.0
55145  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55146    return V;
55147
55148  // FAND(x, 0.0) -> 0.0
55149  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55150    return V;
55151
55152  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55153    return V;
55154
55155  return lowerX86FPLogicOp(N, DAG, Subtarget);
55156}
55157
55158/// Do target-specific dag combines on X86ISD::FANDN nodes.
// FANDN computes ~op0 & op1, hence the asymmetric zero folds below.
55160                            const X86Subtarget &Subtarget) {
55161  // FANDN(0.0, x) -> x
55162  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55163    return N->getOperand(1);
55164
55165  // FANDN(x, 0.0) -> 0.0
55166  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55167    return V;
55168
55169  return lowerX86FPLogicOp(N, DAG, Subtarget);
55170}
55171
55172/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
// Both ops have 0.0 as identity, so either zero operand folds away; FXOR
// with the sign-bit mask may further fold as an fneg via combineFneg.
55175                          const X86Subtarget &Subtarget) {
55176  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55177
55178  // F[X]OR(0.0, x) -> x
55179  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55180    return N->getOperand(1);
55181
55182  // F[X]OR(x, 0.0) -> x
55183  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55184    return N->getOperand(0);
55185
55186  if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55187    return NewVal;
55188
55189  return lowerX86FPLogicOp(N, DAG, Subtarget);
55190}
55191
55192/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
// X86 FMIN/FMAX follow SSE semantics (operand order matters for NaN and
// signed zero). When no-NaNs math is enabled and no-signed-zeros is set
// (second flag check is on a line not visible here — confirm), they can be
// relaxed to the commutative FMINC/FMAXC, which gives isel more freedom.
55194  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55195
55196  // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55197  if (!DAG.getTarget().Options.NoNaNsFPMath ||
55199    return SDValue();
55200
55201  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55202  // into FMINC and FMAXC, which are Commutative operations.
55203  unsigned NewOp = 0;
55204  switch (N->getOpcode()) {
55205  default: llvm_unreachable("unknown opcode");
55206  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55207  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55208  }
55209
55210  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55211                     N->getOperand(0), N->getOperand(1));
55212}
55213
// Lower ISD::FMINNUM/FMAXNUM to X86 FMIN/FMAX. Direct translation is legal
// when NaNs need no special handling; otherwise emit the min/max plus a
// NaN-check select that implements the IEEE "return the number" semantics
// on top of SSE's "return op0 on NaN" behavior (table in comments below).
55215                                     const X86Subtarget &Subtarget) {
55216  EVT VT = N->getValueType(0);
55217  if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55218    return SDValue();
55219
55220  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55221
  // f16 vectors additionally need AVX512-FP16 and (except v32f16) VLX.
55222  auto IsMinMaxLegal = [&](EVT VT) {
55223    if (!TLI.isTypeLegal(VT))
55224      return false;
55225    return VT.getScalarType() != MVT::f16 ||
55226           (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55227  };
55228
55229  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55230        (Subtarget.hasSSE2() && VT == MVT::f64) ||
55231        (Subtarget.hasFP16() && VT == MVT::f16) ||
55232        (VT.isVector() && IsMinMaxLegal(VT))))
55233    return SDValue();
55234
55235  SDValue Op0 = N->getOperand(0);
55236  SDValue Op1 = N->getOperand(1);
55237  SDLoc DL(N);
55238  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55239
55240  // If we don't have to respect NaN inputs, this is a direct translation to x86
55241  // min/max instructions.
55242  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55243    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55244
55245  // If one of the operands is known non-NaN use the native min/max instructions
55246  // with the non-NaN input as second operand.
  // (SSE min/max return the second operand when the first is NaN.)
55247  if (DAG.isKnownNeverNaN(Op1))
55248    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55249  if (DAG.isKnownNeverNaN(Op0))
55250    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55251
55252  // If we have to respect NaN inputs, this takes at least 3 instructions.
55253  // Favor a library call when operating on a scalar and minimizing code size.
55254  if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55255    return SDValue();
55256
55257  EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55258                                         VT);
55259
55260  // There are 4 possibilities involving NaN inputs, and these are the required
55261  // outputs:
55262  //                   Op1
55263  //               Num     NaN
55264  //            ----------------
55265  //     Num    |  Max  |  Op0 |
55266  // Op0        ----------------
55267  //     NaN    |  Op1  |  NaN |
55268  //            ----------------
55269  //
55270  // The SSE FP max/min instructions were not designed for this case, but rather
55271  // to implement:
55272  //   Min = Op1 < Op0 ? Op1 : Op0
55273  //   Max = Op1 > Op0 ? Op1 : Op0
55274  //
55275  // So they always return Op0 if either input is a NaN. However, we can still
55276  // use those instructions for fmaxnum by selecting away a NaN input.
55277
55278  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55279  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55280  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55281
55282  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55283  // are NaN, the NaN value of Op1 is the result.
55284  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55285}
55286
// Combine for X86 int-to-FP conversion nodes: simplify demanded vector
// elements, and shrink a full 128-bit vector load feeding the conversion to
// a vzload of just the lanes the result consumes.
// NOTE(review): the signature lines are not visible in this view.
55289  EVT VT = N->getValueType(0);
55290  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55291
55292  APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55293  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55294    return SDValue(N, 0);
55295
55296  // Convert a full vector load into vzload when not all bits are needed.
55297  SDValue In = N->getOperand(0);
55298  MVT InVT = In.getSimpleValueType();
55299  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55300      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55301    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55302    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
    // Only the low NumBits of the 128-bit vector are actually consumed.
55303    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55304    MVT MemVT = MVT::getIntegerVT(NumBits);
55305    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55306    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55307      SDLoc dl(N);
55308      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55309                                    DAG.getBitcast(InVT, VZLoad));
55310      DCI.CombineTo(N, Convert);
      // Re-route the old load's chain users to the narrowed load's chain.
      // NOTE(review): a line is missing from this view here (presumably
      // DCI.recursivelyDeleteUnusedNodes(LN)).
55311      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55313      return SDValue(N, 0);
55314    }
55315  }
55316
55317  return SDValue();
55318}
55319
// Combine for (possibly strict) FP<->vector conversion nodes: shrink a full
// 128-bit vector load feeding the conversion to a vzload covering only the
// consumed low lanes. Strict nodes thread their chain operand through.
// NOTE(review): the signature lines are not visible in this view.
55323  bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55324  EVT VT = N->getValueType(0);
55325
55326  // Convert a full vector load into vzload when not all bits are needed.
  // For strict nodes operand 0 is the chain; the data input is operand 1.
55327  SDValue In = N->getOperand(IsStrict ? 1 : 0);
55328  MVT InVT = In.getSimpleValueType();
55329  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55330      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55331    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55332    LoadSDNode *LN = cast<LoadSDNode>(In);
55333    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55334    MVT MemVT = MVT::getFloatingPointVT(NumBits);
55335    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55336    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55337      SDLoc dl(N);
55338      if (IsStrict) {
55339        SDValue Convert =
55340            DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55341                        {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55342        DCI.CombineTo(N, Convert, Convert.getValue(1));
55343      } else {
55344        SDValue Convert =
55345            DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55346        DCI.CombineTo(N, Convert);
55347      }
      // NOTE(review): a line is missing from this view here (presumably
      // DCI.recursivelyDeleteUnusedNodes(LN)).
55348      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55350      return SDValue(N, 0);
55351    }
55352  }
55353
55354  return SDValue();
55355}
55356
55357/// Do target-specific dag combines on X86ISD::ANDNP nodes.
// ANDNP(x, y) == ~x & y. Folds: undef/zero/all-ones identities, NOT-input
// reversal back to AND, AVX512 select formation, full constant folding,
// shuffle-combining, demanded bits/elts narrowing, and PSHUFB mask merging.
55360                            const X86Subtarget &Subtarget) {
55361  SDValue N0 = N->getOperand(0);
55362  SDValue N1 = N->getOperand(1);
55363  MVT VT = N->getSimpleValueType(0);
55364  int NumElts = VT.getVectorNumElements();
55365  unsigned EltSizeInBits = VT.getScalarSizeInBits();
55366  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55367  SDLoc DL(N);
55368
55369  // ANDNP(undef, x) -> 0
55370  // ANDNP(x, undef) -> 0
55371  if (N0.isUndef() || N1.isUndef())
55372    return DAG.getConstant(0, DL, VT);
55373
55374  // ANDNP(0, x) -> x
  // NOTE(review): the zero/all-ones operand checks guarding the next three
  // folds are on lines not visible in this view.
55376    return N1;
55377
55378  // ANDNP(x, 0) -> 0
55380    return DAG.getConstant(0, DL, VT);
55381
55382  // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55384    return DAG.getNOT(DL, N0, VT);
55385
55386  // Turn ANDNP back to AND if input is inverted.
55387  if (SDValue Not = IsNOT(N0, DAG))
55388    return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55389
55390  // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
55391  // to make use of predicated selects.
55392  // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55393  if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55394    SDValue Src = N0.getOperand(0);
55395    EVT SrcVT = Src.getValueType();
55396    if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55397        TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55398      return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55399                           getZeroVector(VT, Subtarget, DAG, DL));
55400  }
55401
55402  // Constant Folding
55403  APInt Undefs0, Undefs1;
55404  SmallVector<APInt> EltBits0, EltBits1;
55405  if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55406                                    /*AllowWholeUndefs*/ true,
55407                                    /*AllowPartialUndefs*/ true)) {
55408    if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55409                                      /*AllowWholeUndefs*/ true,
55410                                      /*AllowPartialUndefs*/ true)) {
      // Both operands constant: evaluate ~e0 & e1 per element.
55411      SmallVector<APInt> ResultBits;
55412      for (int I = 0; I != NumElts; ++I)
55413        ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55414      return getConstVector(ResultBits, VT, DAG, DL);
55415    }
55416
55417    // Constant fold NOT(N0) to allow us to use AND.
55418    // Ensure this is only performed if we can confirm that the bitcasted source
55419    // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55420    if (N0->hasOneUse()) {
      // NOTE(review): the line defining BC0 (presumably peekThroughOneUse-
      // Bitcasts(N0)) is not visible in this view.
55422      if (BC0.getOpcode() != ISD::BITCAST) {
55423        for (APInt &Elt : EltBits0)
55424          Elt = ~Elt;
55425        SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55426        return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55427      }
55428    }
55429  }
55430
55431  // Attempt to recursively combine a bitmask ANDNP with shuffles.
55432  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55433    SDValue Op(N, 0);
55434    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55435      return Res;
55436
55437    // If either operand is a constant mask, then only the elements that aren't
55438    // zero are actually demanded by the other operand.
55439    auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55440      APInt UndefElts;
55441      SmallVector<APInt> EltBits;
55442      APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55443      APInt DemandedElts = APInt::getAllOnes(NumElts);
55444      if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55445                                        EltBits)) {
55446        DemandedBits.clearAllBits();
55447        DemandedElts.clearAllBits();
55448        for (int I = 0; I != NumElts; ++I) {
55449          if (UndefElts[I]) {
55450            // We can't assume an undef src element gives an undef dst - the
55451            // other src might be zero.
55452            DemandedBits.setAllBits();
55453            DemandedElts.setBit(I);
55454          } else if ((Invert && !EltBits[I].isAllOnes()) ||
55455                     (!Invert && !EltBits[I].isZero())) {
55456            DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55457            DemandedElts.setBit(I);
55458          }
55459        }
55460      }
55461      return std::make_pair(DemandedBits, DemandedElts);
55462    };
    // N0 is inverted by the op, so its demanded mask uses Invert=true.
55463    APInt Bits0, Elts0;
55464    APInt Bits1, Elts1;
55465    std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55466    std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55467
55468    if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55469        TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55470        TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55471        TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55472      if (N->getOpcode() != ISD::DELETED_NODE)
55473        DCI.AddToWorklist(N);
55474      return SDValue(N, 0);
55475    }
55476  }
55477
55478  // Folds for better commutativity:
55479  if (N1->hasOneUse()) {
55480    // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55481    if (SDValue Not = IsNOT(N1, DAG))
55482      return DAG.getNOT(
55483          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55484
55485    // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55486    // Zero out elements by setting the PSHUFB mask value to 0xFF.
    // N0 must be all-ones-or-zero lanes (sign-splat) for OR-ing it into the
    // PSHUFB mask to be equivalent to zeroing those lanes.
    // NOTE(review): the line defining BC1 (presumably peekThroughOneUse-
    // Bitcasts(N1)) is not visible in this view.
55487    if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55489      if (BC1.getOpcode() == X86ISD::PSHUFB) {
55490        EVT ShufVT = BC1.getValueType();
55491        SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55492                                      DAG.getBitcast(ShufVT, N0));
55493        SDValue NewShuf =
55494            DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55495        return DAG.getBitcast(VT, NewShuf);
55496      }
55497    }
55498  }
55499
55500  return SDValue();
55501}
55502
// Combine for BT (bit test) nodes: the hardware only reads the low
// log2(bitwidth) bits of the bit-index operand, so simplify the index with
// that reduced demanded mask.
// NOTE(review): the signature lines and the DemandedMask initialization
// (presumably getLowBitsSet(BitWidth, Log2_32(BitWidth))) are not visible
// in this view.
55505  SDValue N1 = N->getOperand(1);
55506
55507  // BT ignores high bits in the bit index operand.
55508  unsigned BitWidth = N1.getValueSizeInBits();
55510  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55511    if (N->getOpcode() != ISD::DELETED_NODE)
55512      DCI.AddToWorklist(N);
55513    return SDValue(N, 0);
55514  }
55515
55516  return SDValue();
55517}
55518
// Combine for (STRICT_)CVTPH2PS (half -> float conversion): only the low 4
// of the 8 input i16 lanes are consumed, so shrink demanded elements and
// narrow a full vector load feeding it into a 64-bit vzload.
// NOTE(review): the signature lines are not visible in this view.
55521  bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55522  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55523
55524  if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55525    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55526    APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55527    if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55528      if (N->getOpcode() != ISD::DELETED_NODE)
55529        DCI.AddToWorklist(N);
55530      return SDValue(N, 0);
55531    }
55532
55533    // Convert a full vector load into vzload when not all bits are needed.
55534    if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55535      LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55536      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55537        SDLoc dl(N);
55538        if (IsStrict) {
55539          SDValue Convert = DAG.getNode(
55540              N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55541              {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55542          DCI.CombineTo(N, Convert, Convert.getValue(1));
55543        } else {
55544          SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55545                                        DAG.getBitcast(MVT::v8i16, VZLoad));
55546          DCI.CombineTo(N, Convert);
55547        }
55548
        // NOTE(review): a line is missing from this view here (presumably
        // DCI.recursivelyDeleteUnusedNodes(LN)).
55549        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55551        return SDValue(N, 0);
55552      }
55553    }
55554  }
55555
55556  return SDValue();
55557}
55558
55559// Try to combine sext_in_reg of a cmov of constants by extending the constants.
// Sign-extending the two constant CMOV arms up front lets the
// SIGN_EXTEND_INREG fold away entirely; i16 results are widened to i32
// CMOVs (i16 CMOV is avoided) and truncated afterwards.
55561  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55562
55563  EVT DstVT = N->getValueType(0);
55564
55565  SDValue N0 = N->getOperand(0);
55566  SDValue N1 = N->getOperand(1);
55567  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55568
55569  if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55570    return SDValue();
55571
55572  // Look through single use any_extends / truncs.
55573  SDValue IntermediateBitwidthOp;
55574  if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55575      N0.hasOneUse()) {
55576    IntermediateBitwidthOp = N0;
55577    N0 = N0.getOperand(0);
55578  }
55579
55580  // See if we have a single use cmov.
55581  if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55582    return SDValue();
55583
55584  SDValue CMovOp0 = N0.getOperand(0);
55585  SDValue CMovOp1 = N0.getOperand(1);
55586
55587  // Make sure both operands are constants.
55588  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55589      !isa<ConstantSDNode>(CMovOp1.getNode()))
55590    return SDValue();
55591
55592  SDLoc DL(N);
55593
55594  // If we looked through an any_extend/trunc above, add one to the constants.
55595  if (IntermediateBitwidthOp) {
55596    unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55597    CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55598    CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55599  }
55600
  // Sign-extend the constant arms; these fold to plain constants.
55601  CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55602  CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55603
55604  EVT CMovVT = DstVT;
55605  // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55606  if (DstVT == MVT::i16) {
55607    CMovVT = MVT::i32;
55608    CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55609    CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55610  }
55611
  // Rebuild the CMOV with the original condition and flags operands.
55612  SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55613                             N0.getOperand(2), N0.getOperand(3));
55614
55615  if (CMovVT != DstVT)
55616    CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55617
55618  return CMov;
55619}
55620
// DAG combine for ISD::SIGN_EXTEND_INREG: first try the CMOV-of-constants
// fold, then avoid expensive v4i64 sext_in_reg by doing the in-reg extend
// at v4i32 and sign-extending the result.
55622                                           const X86Subtarget &Subtarget) {
55623  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55624
55625  if (SDValue V = combineSextInRegCmov(N, DAG))
55626    return V;
55627
55628  EVT VT = N->getValueType(0);
55629  SDValue N0 = N->getOperand(0);
55630  SDValue N1 = N->getOperand(1);
55631  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55632  SDLoc dl(N);
55633
55634  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
55635  // both SSE and AVX2 since there is no sign-extended shift right
55636  // operation on a vector with 64-bit elements.
55637  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55638  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55639  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55640                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
55641    SDValue N00 = N0.getOperand(0);
55642
55643    // EXTLOAD has a better solution on AVX2,
55644    // it may be replaced with X86ISD::VSEXT node.
55645    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55646      if (!ISD::isNormalLoad(N00.getNode()))
55647        return SDValue();
55648
55649    // Attempt to promote any comparison mask ops before moving the
55650    // SIGN_EXTEND_INREG in the way.
55651    if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55652      return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55653
55654    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55655      SDValue Tmp =
55656          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55657      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55658    }
55659  }
55660  return SDValue();
55661}
55662
55663/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55664/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55665/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55666/// opportunities to combine math ops, use an LEA, or use a complex addressing
55667/// mode. This can eliminate extend, add, and shift instructions.
55669                                    const X86Subtarget &Subtarget) {
55670  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55671      Ext->getOpcode() != ISD::ZERO_EXTEND)
55672    return SDValue();
55673
55674  // TODO: This should be valid for other integer types.
55675  EVT VT = Ext->getValueType(0);
55676  if (VT != MVT::i64)
55677    return SDValue();
55678
55679  SDValue Add = Ext->getOperand(0);
55680  if (Add.getOpcode() != ISD::ADD)
55681    return SDValue();
55682
  // Recover nsw/nuw either from the node's flags or by value analysis.
55683  SDValue AddOp0 = Add.getOperand(0);
55684  SDValue AddOp1 = Add.getOperand(1);
55685  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55686  bool NSW = Add->getFlags().hasNoSignedWrap();
55687  bool NUW = Add->getFlags().hasNoUnsignedWrap();
55688  NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55689  NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55690
55691  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55692  // into the 'zext'
55693  if ((Sext && !NSW) || (!Sext && !NUW))
55694    return SDValue();
55695
55696  // Having a constant operand to the 'add' ensures that we are not increasing
55697  // the instruction count because the constant is extended for free below.
55698  // A constant operand can also become the displacement field of an LEA.
55699  auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55700  if (!AddOp1C)
55701    return SDValue();
55702
55703  // Don't make the 'add' bigger if there's no hope of combining it with some
55704  // other 'add' or 'shl' instruction.
55705  // TODO: It may be profitable to generate simpler LEA instructions in place
55706  // of single 'add' instructions, but the cost model for selecting an LEA
55707  // currently has a high threshold.
55708  bool HasLEAPotential = false;
55709  for (auto *User : Ext->users()) {
55710    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55711      HasLEAPotential = true;
55712      break;
55713    }
55714  }
55715  if (!HasLEAPotential)
55716    return SDValue();
55717
55718  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55719  int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55720  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55721  SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55722
55723  // The wider add is guaranteed to not wrap because both operands are
55724  // sign-extended.
55725  SDNodeFlags Flags;
55726  Flags.setNoSignedWrap(NSW);
55727  Flags.setNoUnsignedWrap(NUW);
55728  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55729}
55730
55731// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55732// operands and the result of CMOV is not used anywhere else - promote CMOV
55733// itself instead of promoting its result. This could be beneficial, because:
55734// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55735// (or more) pseudo-CMOVs only when they go one-after-another and
55736// getting rid of result extension code after CMOV will help that.
55737// 2) Promotion of constant CMOV arguments is free, hence the
55738// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55739// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55740// promotion is also good in terms of code-size.
55741// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55742// promotion).
// NOTE(review): original line 55743 (the function signature) is elided in
// this listing; from the body, 'Extend' is the extend SDNode being combined
// and 'DAG' is the SelectionDAG — confirm against the full source.
55744 SDValue CMovN = Extend->getOperand(0);
// Only a single-use X86 CMOV is worth promoting; with other users the CMOV
// would have to be kept alive anyway.
55745 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55746 return SDValue();
55747
55748 EVT TargetVT = Extend->getValueType(0);
55749 unsigned ExtendOpcode = Extend->getOpcode();
55750 SDLoc DL(Extend);
55751
55752 EVT VT = CMovN.getValueType();
55753 SDValue CMovOp0 = CMovN.getOperand(0);
55754 SDValue CMovOp1 = CMovN.getOperand(1);
55755
// Both CMOV data operands must be constants — extending a constant is free
// (rationale 2 in the header comment above).
55756 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55757 !isa<ConstantSDNode>(CMovOp1.getNode()))
55758 return SDValue();
55759
55760 // Only extend to i32 or i64.
55761 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55762 return SDValue();
55763
55764 // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
55765 // are free.
55766 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55767 return SDValue();
55768
55769 // If this a zero extend to i64, we should only extend to i32 and use a free
55770 // zero extend to finish.
55771 EVT ExtendVT = TargetVT;
55772 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55773 ExtendVT = MVT::i32;
55774
// Extend the two constant arms; these fold to constants of ExtendVT.
55775 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55776 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55777
// Rebuild the CMOV at the wider type, reusing the original condition code
// (operand 2) and EFLAGS (operand 3).
55778 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55779 CMovN.getOperand(2), CMovN.getOperand(3));
55780
55781 // Finish extending if needed.
55782 if (ExtendVT != TargetVT)
55783 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55784
55785 return Res;
55786}
55787
55788// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55789// result type.
// NOTE(review): original line 55790 (the first line of the signature) is
// elided in this listing; the visible parameters are completed below.
55791 const X86Subtarget &Subtarget) {
55792 SDValue N0 = N->getOperand(0);
55793 EVT VT = N->getValueType(0);
55794 SDLoc dl(N);
55795
55796 // Only do this combine with AVX512 for vector extends.
55797 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55798 return SDValue();
55799
55800 // Only combine legal element types.
55801 EVT SVT = VT.getVectorElementType();
55802 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55803 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55804 return SDValue();
55805
55806 // We don't have CMPP Instruction for vxf16
55807 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55808 return SDValue();
55809 // We can only do this if the vector size in 256 bits or less.
55810 unsigned Size = VT.getSizeInBits();
55811 if (Size > 256 && Subtarget.useAVX512Regs())
55812 return SDValue();
55813
55814 EVT N00VT = N0.getOperand(0).getValueType();
55815
55816 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55817 // that's the only integer compares with we have.
55818 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55819 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55820 return SDValue();
55821
55822 // Only do this combine if the extension will be fully consumed by the setcc.
55823 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55824 if (Size != MatchingVecType.getSizeInBits())
55825 return SDValue();
55826
// Re-issue the compare directly at the extended result type VT.
55827 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55828
// For a zero_extend, clear the bits above the original setcc element width —
// presumably because the wide vector setcc produces sign-extended boolean
// lanes; confirm against SelectionDAG setcc semantics.
55829 if (N->getOpcode() == ISD::ZERO_EXTEND)
55830 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55831
55832 return Res;
55833}
55834
// Combine for SIGN_EXTEND nodes.
// NOTE(review): original lines 55835-55836 (the start of the signature) are
// elided in this listing; the body uses N, DAG, DCI and Subtarget.
55837 const X86Subtarget &Subtarget) {
55838 SDValue N0 = N->getOperand(0);
55839 EVT VT = N->getValueType(0);
55840 SDLoc DL(N);
55841
55842 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
// NOTE(review): original line 55844 (the rest of this condition, which
// matches N0 against X86ISD::SETCC_CARRY) is elided in this listing.
55843 if (!DCI.isBeforeLegalizeOps() &&
55845 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55846 N0->getOperand(1));
55847 bool ReplaceOtherUses = !N0.hasOneUse();
55848 DCI.CombineTo(N, Setcc);
55849 // Replace other uses with a truncate of the widened setcc_carry.
55850 if (ReplaceOtherUses) {
55851 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55852 N0.getValueType(), Setcc);
55853 DCI.CombineTo(N0.getNode(), Trunc);
55854 }
55855
55856 return SDValue(N, 0);
55857 }
55858
55859 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55860 return NewCMov;
55861
// All remaining folds here only apply before operation legalization.
55862 if (!DCI.isBeforeLegalizeOps())
55863 return SDValue();
55864
55865 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55866 return V;
55867
55868 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55869 DAG, DCI, Subtarget))
55870 return V;
55871
55872 if (VT.isVector()) {
55873 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55874 return R;
55875
// NOTE(review): original line 55876 (the condition guarding this fold,
// which checks N0's opcode) is elided in this listing.
55877 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55878 }
55879
55880 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55881 return NewAdd;
55882
55883 return SDValue();
55884}
55885
55886// Inverting a constant vector is profitable if it can be eliminated and the
55887// inverted vector is already present in DAG. Otherwise, it will be loaded
55888// anyway.
55889//
55890// We determine which of the values can be completely eliminated and invert it.
55891// If both are eliminable, select a vector with the first negative element.
// NOTE(review): original lines 55892-55893 (the signature and the start of
// the assert on V being a ConstantFP build vector) are elided in this
// listing; 'V' is the constant build vector and 'DAG' the SelectionDAG.
55894 "ConstantFP build vector expected");
55895 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
55896 // can eliminate it. Since this function is invoked for each FMA with this
55897 // vector.
55898 auto IsNotFMA = [](SDNode *User) {
55899 return User->getOpcode() != ISD::FMA &&
55900 User->getOpcode() != ISD::STRICT_FMA;
55901 };
55902 if (llvm::any_of(V->users(), IsNotFMA))
55903 return SDValue();
55904
// Build the element-wise negation of V (undef lanes stay undef).
// NOTE(review): original line 55905 (the declaration of 'Ops') is elided.
55906 EVT VT = V.getValueType();
55907 EVT EltVT = VT.getVectorElementType();
55908 for (const SDValue &Op : V->op_values()) {
55909 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55910 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55911 } else {
55912 assert(Op.isUndef());
55913 Ops.push_back(DAG.getUNDEF(EltVT));
55914 }
55915 }
55916
// Only succeed if the negated vector ALREADY exists in the DAG; we never
// create a new constant here (see the profitability comment above).
55917 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
55918 if (!NV)
55919 return SDValue();
55920
55921 // If an inverted version cannot be eliminated, choose it instead of the
55922 // original version.
55923 if (llvm::any_of(NV->users(), IsNotFMA))
55924 return SDValue(NV, 0);
55925
55926 // If the inverted version also can be eliminated, we have to consistently
55927 // prefer one of the values. We prefer a constant with a negative value on
55928 // the first place.
55929 // N.B. We need to skip undefs that may precede a value.
55930 for (const SDValue &Op : V->op_values()) {
55931 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55932 if (Cst->isNegative())
55933 return SDValue();
55934 break;
55935 }
55936 }
55937 return SDValue(NV, 0);
55938}
55939
// Combine for FMA / FMA-like nodes: fold negations of the operands into the
// opcode (e.g. turning an FMA with a negated addend into an FMSUB variant).
// NOTE(review): original lines 55940-55941 (the start of the signature) are
// elided in this listing; the body uses N, DAG, DCI and Subtarget.
55942 const X86Subtarget &Subtarget) {
55943 SDLoc dl(N);
55944 EVT VT = N->getValueType(0);
// NOTE(review): original line 55945 (the declaration of 'TSI') is elided.
55946 bool IsStrict = N->isTargetOpcode()
55947 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55948 : N->isStrictFPOpcode();
55949
55950 // Let legalize expand this if it isn't a legal type yet.
55951 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55952 if (!TLI.isTypeLegal(VT))
55953 return SDValue();
55954
// For strict FP nodes operand 0 is the chain, so the A/B/C data operands
// are shifted by one.
55955 SDValue A = N->getOperand(IsStrict ? 1 : 0);
55956 SDValue B = N->getOperand(IsStrict ? 2 : 1);
55957 SDValue C = N->getOperand(IsStrict ? 3 : 2);
55958
55959 // If the operation allows fast-math and the target does not support FMA,
55960 // split this into mul+add to avoid libcall(s).
55961 SDNodeFlags Flags = N->getFlags();
55962 if (!IsStrict && Flags.hasAllowReassociation() &&
55963 TLI.isOperationExpand(ISD::FMA, VT)) {
55964 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
55965 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
55966 }
55967
// Bail unless the scalar type has FMA hardware support on this subtarget.
55968 EVT ScalarVT = VT.getScalarType();
55969 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
55970 !Subtarget.hasAnyFMA()) &&
55971 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
55972 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
55973 return SDValue();
55974
// If V is (cheaply) a negated expression, rewrite V in place to the
// un-negated form and report true so the caller can flip the opcode.
55975 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
// NOTE(review): original line 55976 (the declaration of 'CodeSize') is
// elided in this listing.
55977 bool LegalOperations = !DCI.isBeforeLegalizeOps();
55978 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
55979 CodeSize)) {
55980 V = NegV;
55981 return true;
55982 }
55983 // Look through extract_vector_elts. If it comes from an FNEG, create a
55984 // new extract from the FNEG input.
55985 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55986 isNullConstant(V.getOperand(1))) {
55987 SDValue Vec = V.getOperand(0);
55988 if (SDValue NegV = TLI.getCheaperNegatedExpression(
55989 Vec, DAG, LegalOperations, CodeSize)) {
55990 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
55991 NegV, V.getOperand(1));
55992 return true;
55993 }
55994 }
55995 // Lookup if there is an inverted version of constant vector V in DAG.
55996 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
55997 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
55998 V = NegV;
55999 return true;
56000 }
56001 }
56002 return false;
56003 };
56004
56005 // Do not convert the passthru input of scalar intrinsics.
56006 // FIXME: We could allow negations of the lower element only.
56007 bool NegA = invertIfNegative(A);
56008 // Create a dummy use for A so that in the process of negating B or C
56009 // recursively, it is not deleted.
56010 HandleSDNode NegAHandle(A);
56011 bool NegB = invertIfNegative(B);
56012 // Similar to A, get a handle on B.
56013 HandleSDNode NegBHandle(B);
56014 bool NegC = invertIfNegative(C);
56015
56016 if (!NegA && !NegB && !NegC)
56017 return SDValue();
56018
// NegA != NegB: negating exactly one multiplicand negates the product.
56019 unsigned NewOpcode =
56020 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56021
56022 // Propagate fast-math-flags to new FMA node.
56023 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56024 if (IsStrict) {
56025 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56026 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56027 {N->getOperand(0), A, B, C});
56028 } else {
56029 if (N->getNumOperands() == 4)
56030 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56031 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56032 }
56033}
56034
56035// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56036// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
// NOTE(review): original lines 56037-56038 (the signature) are elided in
// this listing; the body uses N, DAG and DCI.
56039 SDLoc dl(N);
56040 EVT VT = N->getValueType(0);
56041 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): original line 56042 (the declaration of 'CodeSize') is
// elided in this listing.
56043 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56044
// Operand 2 is the addend C; only combine if negating it is cheaper.
56045 SDValue N2 = N->getOperand(2);
56046
56047 SDValue NegN2 =
56048 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56049 if (!NegN2)
56050 return SDValue();
// Negating only the addend flips ADDSUB <-> SUBADD.
56051 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56052
// A 4-operand node carries an extra operand (e.g. rounding control) that
// must be preserved.
56053 if (N->getNumOperands() == 4)
56054 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56055 NegN2, N->getOperand(3));
56056 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56057 NegN2);
56058}
56059
56060// Try to widen the build vector and bitcast it to the type of zext.
56061// This is a special case for the 128-bit vector types. Intention is to remove
56062// the zext and replace it with a bitcast the wider type. While lowering
56063// the bitcast is removed and extra commutation due to zext is avoided.
56064// For example:
56065// zext v4i16 ( v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 ( v8i8
56066// build_vector (x, 0, y, 0, z, w, 0)
// NOTE(review): original line 56067 (the function signature) is elided in
// this listing; 'Extend' is the extend SDNode and 'DAG' the SelectionDAG.
56068
56069 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56070 return SDValue();
56071
56072 EVT ExtendVT = Extend->getValueType(0);
56073
// Match a single-use BUILD_VECTOR source.
56074 SDValue BV = Extend->getOperand(0);
56075 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56076 return SDValue();
56077
56078 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56079 // If the build vector has undef elements, we cannot widen it.
56080 // The widening would create a vector with more undef elements, which
56081 // is not valid.
56082 return SDValue();
56083 }
56084
56085 if (!all_of(BV->op_values(),
56086 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56087 // If the build vector any element other than \ISD::LOAD, we cannot widen
56088 // it.
56089 return SDValue();
56090 }
56091
56092 SDLoc dl(BV);
56093 EVT VT = BV.getValueType();
56094 EVT EltVT = BV.getOperand(0).getValueType();
56095 unsigned NumElts = VT.getVectorNumElements();
56096
56097 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56098
// Only act when type legalization would widen this vector type anyway.
// NOTE(review): original line 56100 (the right-hand side of this comparison,
// presumably TargetLowering::TypeWidenVector) is elided in this listing.
56099 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56101 return SDValue();
56102
56103 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56104 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56105
56106 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56107 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56108 // Fill the new elements with Zero.
56109 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56110 // Compute the step to place the elements in the right place and control the
56111 // iteration.
56112 unsigned step = WidenNumElts / NumElts;
56113 if (WidenVT.is128BitVector()) {
56114 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
// Scatter the original elements to every 'step'-th lane (zero-fill lanes
// in between), working backwards so lanes are not overwritten.
56115 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56116 i--, j -= step) {
56117 SDValue temp = NewOps[i];
56118 NewOps[i] = NewOps[j];
56119 NewOps[j] = temp;
56120 }
56121 // Create new build vector with WidenVT and NewOps
56122 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56123 // Replace the old build vector with the new one. Bitcast the
56124 // new build vector to the type of the zext.
56125 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56126 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56127 return NewBV;
56128 }
56129 }
56130 return SDValue();
56131}
56132
// Combine for ZERO_EXTEND / ANY_EXTEND nodes.
// NOTE(review): original lines 56133-56134 (the start of the signature) are
// elided in this listing; the body uses N, DAG, DCI and Subtarget.
56135 const X86Subtarget &Subtarget) {
56136 SDLoc dl(N);
56137 SDValue N0 = N->getOperand(0);
56138 EVT VT = N->getValueType(0);
56139
56140 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56141 // FIXME: Is this needed? We don't seem to have any tests for it.
// NOTE(review): original line 56143 (the rest of this condition, matching
// N0 against X86ISD::SETCC_CARRY) is elided in this listing.
56142 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56144 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56145 N0->getOperand(1));
56146 bool ReplaceOtherUses = !N0.hasOneUse();
56147 DCI.CombineTo(N, Setcc);
56148 // Replace other uses with a truncate of the widened setcc_carry.
56149 if (ReplaceOtherUses) {
56150 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56151 N0.getValueType(), Setcc);
56152 DCI.CombineTo(N0.getNode(), Trunc);
56153 }
56154
56155 return SDValue(N, 0);
56156 }
56157
56158 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56159 return NewCMov;
56160
56161 if (DCI.isBeforeLegalizeOps())
56162 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56163 return V;
56164
56165 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56166 DAG, DCI, Subtarget))
56167 return V;
56168
56169 if (VT.isVector())
56170 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56171 return R;
56172
56173 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56174 return NewAdd;
56175
56176 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56177 return R;
56178
56179 // TODO: Combine with any target/faux shuffle.
// zext of a 128-bit PACKUS whose inputs already have their upper half bits
// zero: the pack is a no-op, so just concatenate the sources.
// NOTE(review): original line 56181 (the rest of this condition) is elided
// in this listing.
56180 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56182 SDValue N00 = N0.getOperand(0);
56183 SDValue N01 = N0.getOperand(1);
56184 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56185 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56186 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56187 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56188 return concatSubVectors(N00, N01, DAG, dl);
56189 }
56190 }
56191
56192 if (SDValue V = widenBuildVec(N, DAG))
56193 return V;
56194
56195 return SDValue();
56196}
56197
56198/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56199/// pre-promote its result type since vXi1 vectors don't get promoted
56200/// during type legalization.
// NOTE(review): original line 56201 (the first line of the signature, with
// the VT/OpVT/LHS parameters) is elided in this listing.
56202 SDValue RHS, ISD::CondCode CC,
56203 const SDLoc &DL, SelectionDAG &DAG,
56204 const X86Subtarget &Subtarget) {
// Without BWI there are no byte/word mask compares, so compare at the
// operand element type (i8/i16) and truncate the result down to vXi1.
56205 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56206 VT.getVectorElementType() == MVT::i1 &&
56207 (OpVT.getVectorElementType() == MVT::i8 ||
56208 OpVT.getVectorElementType() == MVT::i16)) {
56209 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56210 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56211 }
56212 return SDValue();
56213}
56214
56215// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56216// eq/ne) is generated when using an integer as a mask. Instead of generating a
56217// broadcast + vptest, we can directly move the integer to a mask register.
// NOTE(review): original line 56218 (the first line of the signature, with
// the VT/Op0/CC parameters) is elided in this listing.
56219 const SDLoc &DL, SelectionDAG &DAG,
56220 const X86Subtarget &Subtarget) {
56221 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56222 return SDValue();
56223
56224 if (!Subtarget.hasAVX512())
56225 return SDValue();
56226
56227 if (Op0.getOpcode() != ISD::AND)
56228 return SDValue();
56229
56230 SDValue Broadcast = Op0.getOperand(0);
56231 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56232 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56233 return SDValue();
56234
// 'Load' is the AND's constant operand (typically a constant-pool load of
// the power-of-two table).
56235 SDValue Load = Op0.getOperand(1);
56236 EVT LoadVT = Load.getSimpleValueType();
56237
56238 APInt UndefElts;
56239 SmallVector<APInt, 32> EltBits;
// NOTE(review): original line 56240 (the start of this call, presumably
// getTargetConstantBitsFromNode(Load, ...)) is elided in this listing.
56241 UndefElts, EltBits,
56242 /*AllowWholeUndefs*/ true,
56243 /*AllowPartialUndefs*/ false) ||
56244 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56245 return SDValue();
56246
56247 // Check if the constant pool contains only powers of 2 starting from some
56248 // 2^N. The table may also contain undefs because of widening of vector
56249 // operands.
56250 unsigned N = EltBits[0].logBase2();
56251 unsigned Len = UndefElts.getBitWidth();
56252 for (unsigned I = 1; I != Len; ++I) {
56253 if (UndefElts[I]) {
// Once an undef is seen, all remaining elements must be undef too.
56254 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56255 return SDValue();
56256 break;
56257 }
56258
// Element I must be exactly 2^(N+I).
56259 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56260 return SDValue();
56261 }
56262
// Recover the scalar integer being broadcast. For a broadcast-load we must
// extract element 0; for VBROADCAST the scalar is operand 0 directly.
56263 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56264 SDValue BroadcastOp;
56265 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56266 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56267 Broadcast, DAG.getVectorIdxConstant(0, DL));
56268 } else {
56269 BroadcastOp = Broadcast.getOperand(0);
56270 if (BroadcastOp.getValueType().isVector())
56271 return SDValue();
56272 }
56273
// If the table starts at 2^N with N != 0, shift the scalar right by N and
// mask to the defined lanes so stray high bits don't leak into the mask.
56274 SDValue Masked = BroadcastOp;
56275 if (N != 0) {
56276 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56277 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56278
56279 if (NumDefinedElts > BroadcastOpBitWidth)
56280 return SDValue();
56281
56282 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56283 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56284 DAG.getConstant(N, DL, BroadcastOpVT));
56285 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56286 DAG.getConstant(Mask, DL, BroadcastOpVT));
56287 }
56288 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56289 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56290 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56291 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56292
// SETEQ against zero means "bit NOT set", so invert the mask bits.
56293 if (CC == ISD::SETEQ)
56294 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56295
56296 if (VT != MVT::v16i1)
56297 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56298 DAG.getVectorIdxConstant(0, DL));
56299
56300 return Bitcast;
56301}
56302
// Combine for ISD::SETCC nodes.
// NOTE(review): original lines 56303-56304 (the start of the signature) are
// elided in this listing; the body uses N, DAG, DCI and Subtarget.
56305 const X86Subtarget &Subtarget) {
56306 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56307 const SDValue LHS = N->getOperand(0);
56308 const SDValue RHS = N->getOperand(1);
56309 EVT VT = N->getValueType(0);
56310 EVT OpVT = LHS.getValueType();
56311 SDLoc DL(N);
56312
56313 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56314 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56315 Subtarget))
56316 return V;
56317 }
56318
56319 if (VT == MVT::i1) {
56320 X86::CondCode X86CC;
56321 if (SDValue V =
56322 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56323 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56324 }
56325
56326 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56327 if (OpVT.isScalarInteger()) {
56328 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56329 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56330 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56331 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56332 if (N0.getOperand(0) == N1)
56333 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56334 N0.getOperand(1));
56335 if (N0.getOperand(1) == N1)
56336 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56337 N0.getOperand(0));
56338 }
56339 return SDValue();
56340 };
// Try both operand orders since the OR may be on either side.
56341 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56342 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56343 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56344 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56345
56346 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56347 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56348 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56349 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56350 if (N0.getOperand(0) == N1)
56351 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56352 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56353 if (N0.getOperand(1) == N1)
56354 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56355 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56356 }
56357 return SDValue();
56358 };
56359 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56360 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56361 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56362 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56363
56364 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56365 // cmpne(trunc(x),C) --> cmpne(x,C)
56366 // iff x upper bits are zero.
56367 if (LHS.getOpcode() == ISD::TRUNCATE &&
56368 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56369 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
56370 EVT SrcVT = LHS.getOperand(0).getValueType();
// NOTE(review): original line 56371 (the declaration of 'UpperBits', the
// mask of bits above OpVT's width) is elided in this listing.
56372 OpVT.getScalarSizeInBits());
56373 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56374 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56375 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56376 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56377 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56378 }
56379
56380 // With C as a power of 2 and C != 0 and C != INT_MIN:
56381 // icmp eq Abs(X) C ->
56382 // (icmp eq A, C) | (icmp eq A, -C)
56383 // icmp ne Abs(X) C ->
56384 // (icmp ne A, C) & (icmp ne A, -C)
56385 // Both of these patterns can be better optimized in
56386 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56387 // integers which is checked above.
56388 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56389 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56390 const APInt &CInt = C->getAPIntValue();
56391 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56392 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56393 SDValue BaseOp = LHS.getOperand(0);
56394 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56395 SDValue SETCC1 = DAG.getSetCC(
56396 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56397 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56398 SETCC0, SETCC1);
56399 }
56400 }
56401 }
56402 }
56403 }
56404
56405 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56406 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56407 // Using temporaries to avoid messing up operand ordering for later
56408 // transformations if this doesn't work.
56409 SDValue Op0 = LHS;
56410 SDValue Op1 = RHS;
56411 ISD::CondCode TmpCC = CC;
56412 // Put build_vector on the right.
56413 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56414 std::swap(Op0, Op1);
56415 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56416 }
56417
56418 bool IsSEXT0 =
56419 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56420 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56421 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56422
// sext(vXi1) vs all-zeros: each lane is either 0 or -1, so the compare
// folds to a constant, the mask itself, or its complement.
56423 if (IsSEXT0 && IsVZero1) {
56424 assert(VT == Op0.getOperand(0).getValueType() &&
56425 "Unexpected operand type");
56426 if (TmpCC == ISD::SETGT)
56427 return DAG.getConstant(0, DL, VT);
56428 if (TmpCC == ISD::SETLE)
56429 return DAG.getConstant(1, DL, VT);
56430 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56431 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56432
56433 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56434 "Unexpected condition code!");
56435 return Op0.getOperand(0);
56436 }
56437
56438 if (IsVZero1)
56439 if (SDValue V =
56440 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56441 return V;
56442 }
56443
56444 // Try and make unsigned vector comparison signed. On pre AVX512 targets there
56445 // only are unsigned comparisons (`PCMPGT`) and on AVX512 its often better to
56446 // use `PCMPGT` if the result is mean to stay in a vector (and if its going to
56447 // a mask, there are signed AVX512 comparisons).
56448 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56449 bool CanMakeSigned = false;
56450 if (ISD::isUnsignedIntSetCC(CC)) {
// NOTE(review): original line 56452 (the computeKnownBits call initializing
// 'CmpKnown') is elided in this listing.
56451 KnownBits CmpKnown =
56453 // If we know LHS/RHS share the same sign bit at each element we can
56454 // make this signed.
56455 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56456 // across all lanes. So a pattern where the sign varies from lane to
56457 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56458 // missed. We could get around this by demanding each lane
56459 // independently, but this isn't the most important optimization and
56460 // that may eat into compile time.
56461 CanMakeSigned =
56462 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56463 }
56464 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56465 SDValue LHSOut = LHS;
56466 SDValue RHSOut = RHS;
56467 ISD::CondCode NewCC = CC;
// Canonicalize GE/LE-style compares to GT/LT by adjusting a constant
// operand by one (with no-signed-wrap), then treat as the strict form.
56468 switch (CC) {
56469 case ISD::SETGE:
56470 case ISD::SETUGE:
56471 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56472 /*NSW*/ true))
56473 LHSOut = NewLHS;
56474 else if (SDValue NewRHS = incDecVectorConstant(
56475 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56476 RHSOut = NewRHS;
56477 else
56478 break;
56479
56480 [[fallthrough]];
56481 case ISD::SETUGT:
56482 NewCC = ISD::SETGT;
56483 break;
56484
56485 case ISD::SETLE:
56486 case ISD::SETULE:
56487 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56488 /*NSW*/ true))
56489 LHSOut = NewLHS;
56490 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56491 /*NSW*/ true))
56492 RHSOut = NewRHS;
56493 else
56494 break;
56495
56496 [[fallthrough]];
56497 case ISD::SETULT:
56498 // Will be swapped to SETGT in LowerVSETCC*.
56499 NewCC = ISD::SETLT;
56500 break;
56501 default:
56502 break;
56503 }
56504 if (NewCC != CC) {
56505 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56506 NewCC, DL, DAG, Subtarget))
56507 return R;
56508 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56509 }
56510 }
56511 }
56512
56513 if (SDValue R =
56514 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56515 return R;
56516
56517 // In the middle end transforms:
56518 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56519 // -> `(icmp ult (add x, -C), 2)`
56520 // Likewise inverted cases with `ugt`.
56521 //
56522 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56523 // in worse codegen. So, undo the middle-end transform and go back to `(or
56524 // (icmp eq), (icmp eq))` form.
56525 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56526 // the xmm approach.
56527 //
56528 // NB: We don't handle the similiar simplication of `(and (icmp ne), (icmp
56529 // ne))` as it doesn't end up instruction positive.
56530 // TODO: We might want to do this for avx512 as well if we `sext` the result.
56531 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56532 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56533 !Subtarget.hasAVX512() &&
56534 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56535 Subtarget.hasAVX2()) &&
56536 LHS.hasOneUse()) {
56537
56538 APInt CmpC;
56539 SDValue AddC = LHS.getOperand(1);
// NOTE(review): original line 56541 (the rest of this condition, checking
// the add operand is a constant build vector) is elided in this listing.
56540 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56542 // See which form we have depending on the constant/condition.
56543 SDValue C0 = SDValue();
56544 SDValue C1 = SDValue();
56545
56546 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56547 // we will end up generating an additional constant. Keeping in the
56548 // current form has a slight latency cost, but it probably worth saving a
56549 // constant.
// NOTE(review): original lines 56550-56551 (the condition for this
// 'umin'-preserving early case) are elided in this listing.
56552 // Pass
56553 }
56554 // Normal Cases
56555 else if ((CC == ISD::SETULT && CmpC == 2) ||
56556 (CC == ISD::SETULE && CmpC == 1)) {
56557 // These will constant fold.
56558 C0 = DAG.getNegative(AddC, DL, OpVT);
56559 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56560 DAG.getAllOnesConstant(DL, OpVT));
56561 }
56562 // Inverted Cases
56563 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56564 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56565 // These will constant fold.
56566 C0 = DAG.getNOT(DL, AddC, OpVT);
56567 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56568 DAG.getAllOnesConstant(DL, OpVT));
56569 }
56570 if (C0 && C1) {
// Rebuild the original `(or (icmp eq X, C0), (icmp eq X, C1))` form.
56571 SDValue NewLHS =
56572 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56573 SDValue NewRHS =
56574 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56575 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56576 }
56577 }
56578 }
56579
56580 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56581 // to avoid scalarization via legalization because v4i32 is not a legal type.
56582 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56583 LHS.getValueType() == MVT::v4f32)
56584 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56585
56586 // X pred 0.0 --> X pred -X
56587 // If the negation of X already exists, use it in the comparison. This removes
56588 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56589 // instructions in patterns with a 'select' node.
// NOTE(review): original line 56590 (the condition guarding this FP fold,
// presumably checking RHS is a zero FP constant) is elided in this listing.
56591 SDVTList FNegVT = DAG.getVTList(OpVT);
56592 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56593 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56594 }
56595
56596 return SDValue();
56597}
56598
56601                                 const X86Subtarget &Subtarget) {
  // DAG combine for X86ISD::MOVMSK (gather the sign bit of each vector element
  // of operand 0 into the low bits of an i32 result).  Returns the replacement
  // value, SDValue(N, 0) if N was simplified in place, or an empty SDValue.
56602   SDValue Src = N->getOperand(0);
56603   MVT SrcVT = Src.getSimpleValueType();
56604   MVT VT = N->getSimpleValueType(0);
56605   unsigned NumBits = VT.getScalarSizeInBits();
56606   unsigned NumElts = SrcVT.getVectorNumElements();
56607   unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56608   assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56609
56610   // Perform constant folding.
56611   APInt UndefElts;
56612   SmallVector<APInt, 32> EltBits;
56613   if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56614                                     /*AllowWholeUndefs*/ true,
56615                                     /*AllowPartialUndefs*/ true)) {
56616     APInt Imm(32, 0);
        // Result bit Idx is the sign bit of defined element Idx; undef
        // elements contribute 0.
56617     for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56618       if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56619         Imm.setBit(Idx);
56620
56621     return DAG.getConstant(Imm, SDLoc(N), VT);
56622   }
56623
56624   // Look through int->fp bitcasts that don't change the element width.
56625   unsigned EltWidth = SrcVT.getScalarSizeInBits();
56626   if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56627       Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56628     return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56629
56630   // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56631   // with scalar comparisons.
        // IsNOT recognizes Src as a bitwise NOT and returns the inverted value;
        // the XOR with the low-NumElts mask inverts only the used result bits.
56632   if (SDValue NotSrc = IsNOT(Src, DAG)) {
56633     SDLoc DL(N);
56634     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56635     NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56636     return DAG.getNode(ISD::XOR, DL, VT,
56637                        DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56638                        DAG.getConstant(NotMask, DL, VT));
56639   }
56640
56641   // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56642   // results with scalar comparisons.
56643   if (Src.getOpcode() == X86ISD::PCMPGT &&
56644       ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56645     SDLoc DL(N);
56646     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56647     return DAG.getNode(ISD::XOR, DL, VT,
56648                        DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56649                        DAG.getConstant(NotMask, DL, VT));
56650   }
56651
56652   // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56653   // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56654   // iff pow2splat(c1).
56655   // Use KnownBits to determine if only a single bit is non-zero
56656   // in each element (pow2 or zero), and shift that bit to the msb.
56657   if (Src.getOpcode() == X86ISD::PCMPEQ) {
56658     KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56659     KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
          // ShiftAmt moves the (single possible) set bit up to the sign bit.
56660     unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56661     if (KnownLHS.countMaxPopulation() == 1 &&
56662         (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56663                                ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56664       SDLoc DL(N);
56665       MVT ShiftVT = SrcVT;
56666       SDValue ShiftLHS = Src.getOperand(0);
56667       SDValue ShiftRHS = Src.getOperand(1);
56668       if (ShiftVT.getScalarType() == MVT::i8) {
56669         // vXi8 shifts - we only care about the signbit so can use PSLLW.
56670         ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56671         ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56672         ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56673       }
56674       ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56675                                             ShiftLHS, ShiftAmt, DAG);
56676       ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56677                                             ShiftRHS, ShiftAmt, DAG);
56678       ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56679       ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
            // NOT(XOR(lhs_msb, rhs_msb)) == equality of the selected bit,
            // mirroring the PCMPEQ result in the sign bit of each element.
56680       SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56681       return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56682     }
56683   }
56684
56685   // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56686   if (N->isOnlyUserOf(Src.getNode())) {
        // NOTE(review): SrcBC is presumably Src with bitcasts peeked through;
        // its definition is not visible in this listing — confirm upstream.
56688     if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56689       APInt UndefElts;
56690       SmallVector<APInt, 32> EltBits;
56691       if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56692                                         UndefElts, EltBits)) {
            // Translate the vector constant into the scalar mask MOVMSK would
            // have produced from it, then apply the logic op post-MOVMSK.
56693         APInt Mask = APInt::getZero(NumBits);
56694         for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56695           if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56696             Mask.setBit(Idx);
56697         }
56698         SDLoc DL(N);
56699         SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56700         SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56701         return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56702                            DAG.getConstant(Mask, DL, VT));
56703       }
56704     }
56705   }
56706
56707   // Simplify the inputs.
56708   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56709   APInt DemandedMask(APInt::getAllOnes(NumBits));
56710   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56711     return SDValue(N, 0);
56712
56713   return SDValue();
56714 }
56715
56718                             const X86Subtarget &Subtarget) {
  // Demanded-bits simplification for this node's operands: all result bits
  // are demanded; if SimplifyDemandedBits changed anything, return the node
  // itself to signal "revisit".  (Signature line not visible in this listing —
  // presumably the X86ISD::TESTP combine; confirm against the full source.)
56719   MVT VT = N->getSimpleValueType(0);
56720   unsigned NumBits = VT.getScalarSizeInBits();
56721
56722   // Simplify the inputs.
56723   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56724   APInt DemandedMask(APInt::getAllOnes(NumBits));
56725   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56726     return SDValue(N, 0);
56727
56728   return SDValue();
56729 }
56730
56733   auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
  // Combine for X86-specific masked gather/scatter nodes: only the sign bit
  // of each (non-i1) mask element is consulted by the hardware, so simplify
  // the mask demanding just that bit.
56734   SDValue Mask = MemOp->getMask();
56735
56736   // With vector masks we only demand the upper bit of the mask.
56737   if (Mask.getScalarValueSizeInBits() != 1) {
56738     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56739     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56740     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
          // SimplifyDemandedBits may have CSE'd/deleted N; only requeue a
          // still-live node.
56741       if (N->getOpcode() != ISD::DELETED_NODE)
56742         DCI.AddToWorklist(N);
56743       return SDValue(N, 0);
56744     }
56745   }
56746
56747   return SDValue();
56748 }
56749
56751                                  SDValue Index, SDValue Base, SDValue Scale,
56752                                  SelectionDAG &DAG) {
  // Rebuild a masked gather or scatter identical to GorS but with the given
  // Index/Base/Scale operands, preserving chain, mask, memory operand, index
  // type, and extension/truncation behavior.
56753   SDLoc DL(GorS);
56754
56755   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56756     SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56757                       Gather->getMask(), Base, Index, Scale } ;
56758     return DAG.getMaskedGather(Gather->getVTList(),
56759                                Gather->getMemoryVT(), DL, Ops,
56760                                Gather->getMemOperand(),
56761                                Gather->getIndexType(),
56762                                Gather->getExtensionType());
56763   }
  // Not a gather, so it must be a scatter.
56764   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56765   SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56766                     Scatter->getMask(), Base, Index, Scale };
56767   return DAG.getMaskedScatter(Scatter->getVTList(),
56768                               Scatter->getMemoryVT(), DL,
56769                               Ops, Scatter->getMemOperand(),
56770                               Scatter->getIndexType(),
56771                               Scatter->isTruncatingStore());
56772 }
56773
56776   SDLoc DL(N);
  // Combine for generic masked gather/scatter nodes: move shift/splat-add
  // work from the vector index into the scalar Scale/Base operands, shrink
  // oversized index types, legalize the index width, and simplify the mask.
56777   auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56778   SDValue Index = GorS->getIndex();
56779   SDValue Base = GorS->getBasePtr();
56780   SDValue Scale = GorS->getScale();
56781   EVT IndexVT = Index.getValueType();
56782   EVT IndexSVT = IndexVT.getVectorElementType();
56783   unsigned IndexWidth = Index.getScalarValueSizeInBits();
56784   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56785   EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56786
56787   if (DCI.isBeforeLegalize()) {
56788     // Attempt to move shifted index into the address scale, allows further
56789     // index truncation below.
56790     if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56791         isa<ConstantSDNode>(Scale)) {
56792       unsigned ScaleAmt = Scale->getAsZExtVal();
56793       assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56794       unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
          // Bits that would be shifted out past the scale are never used by
          // the address computation, so drop the demand on them.
56795       unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56796       APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56797       if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56798         if (N->getOpcode() != ISD::DELETED_NODE)
56799           DCI.AddToWorklist(N);
56800         return SDValue(N, 0);
56801       }
          // Fold one bit of a known non-zero shift into the scale (max x8,
          // hence Log2ScaleAmt < 3): shl(x, c) * s == shl(x, c-1) * 2s.
56802       if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56803         if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56804             DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56805           SDValue ShAmt = Index.getOperand(1);
56806           SDValue NewShAmt =
56807               DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56808                           DAG.getConstant(1, DL, ShAmt.getValueType()));
56809           SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56810                                          Index.getOperand(0), NewShAmt);
56811           SDValue NewScale =
56812               DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56813           return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56814         }
56815       }
56816     }
56817
56818     // Shrink indices if they are larger than 32-bits.
56819     // Only do this before legalize types since v2i64 could become v2i32.
56820     // FIXME: We could check that the type is legal if we're after legalize
56821     // types, but then we would need to construct test cases where that happens.
56822     if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56823       EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56824
56825       // FIXME: We could support more than just constant fold, but we need to
56826       // careful with costing. A truncate that can be optimized out would be
56827       // fine. Otherwise we might only want to create a truncate if it avoids
56828       // a split.
56829       if (SDValue TruncIndex =
56830               DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56831         return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56832
56833       // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
56834       // there are sufficient sign bits. Only do this before legalize types to
56835       // avoid creating illegal types in truncate.
56836       if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56837            Index.getOpcode() == ISD::ZERO_EXTEND) &&
56838           Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56839         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56840         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56841       }
56842
56843       // Shrink if we remove an illegal type.
56844       if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56845         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56846         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56847       }
56848     }
56849   }
56850
56851   // Try to move splat adders from the index operand to the base
56852   // pointer operand. Taking care to multiply by the scale. We can only do
56853   // this when index element type is the same as the pointer type.
56854   // Otherwise we need to be sure the math doesn't wrap before the scale.
56855   if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56856       isa<ConstantSDNode>(Scale)) {
56857     uint64_t ScaleAmt = Scale->getAsZExtVal();
56858
        // Examine both operands of the ADD for a splat build_vector.
56859     for (unsigned I = 0; I != 2; ++I)
56860       if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56861         BitVector UndefElts;
56862         if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56863           if (UndefElts.none()) {
56864             // If the splat value is constant we can add the scaled splat value
56865             // to the existing base.
56866             if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56867               APInt Adder = C->getAPIntValue() * ScaleAmt;
56868               SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56869                                             DAG.getConstant(Adder, DL, PtrVT));
56870               SDValue NewIndex = Index.getOperand(1 - I);
56871               return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56872             }
56873             // For non-constant cases, limit this to non-scaled cases.
56874             if (ScaleAmt == 1) {
56875               SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56876               SDValue NewIndex = Index.getOperand(1 - I);
56877               return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56878             }
56879           }
56880         }
56881         // It's also possible base is just a constant. In that case, just
56882         // replace it with 0 and move the displacement into the index.
56883         if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56884           SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56885           // Combine the constant build_vector and the constant base.
56886           Splat =
56887               DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56888           // Add to the other half of the original Index add.
56889           SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56890                                          Index.getOperand(1 - I), Splat);
56891           SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56892           return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56893         }
56894       }
56895   }
56896
56897   if (DCI.isBeforeLegalizeOps()) {
56898     // Make sure the index is either i32 or i64
56899     if (IndexWidth != 32 && IndexWidth != 64) {
56900       MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56901       IndexVT = IndexVT.changeVectorElementType(EltVT);
56902       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56903       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56904     }
56905   }
56906
56907   // With vector masks we only demand the upper bit of the mask.
56908   SDValue Mask = GorS->getMask();
56909   if (Mask.getScalarValueSizeInBits() != 1) {
56910     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56911     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56912       if (N->getOpcode() != ISD::DELETED_NODE)
56913         DCI.AddToWorklist(N);
56914       return SDValue(N, 0);
56915     }
56916   }
56917
56918   return SDValue();
56919 }
56920
56921// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
56923                                const X86Subtarget &Subtarget) {
  // Operand 0 holds the X86 condition code; operand 1 is the EFLAGS value.
  // Delegate to combineSetCCEFLAGS, which may pick a cheaper flags producer
  // and/or adjust CC, then rebuild the SETCC from the simplified pair.
56924   SDLoc DL(N);
56925   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56926   SDValue EFLAGS = N->getOperand(1);
56927
56928   // Try to simplify the EFLAGS and condition code operands.
56929   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
56930     return getSETCC(CC, Flags, DL, DAG);
56931
56932   return SDValue();
56933 }
56934
56935/// Optimize branch condition evaluation.
56937                                const X86Subtarget &Subtarget) {
  // X86ISD::BRCOND layout: op0 = chain, op1 = target block, op2 = condition
  // code, op3 = EFLAGS.  Simplify the flags/CC pair and rebuild the branch.
56938   SDLoc DL(N);
56939   SDValue EFLAGS = N->getOperand(3);
56940   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56941
56942   // Try to simplify the EFLAGS and condition code operands.
56943   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
56944   // RAUW them under us.
56945   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
        // CC may have been rewritten by combineSetCCEFLAGS; re-encode it.
56946     SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
56947     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56948                        N->getOperand(1), Cond, Flags);
56949   }
56950
56951   return SDValue();
56952 }
56953
56954// TODO: Could we move this to DAGCombine?
56956                                                   SelectionDAG &DAG) {
  // Hoist a unary FP op (possibly a strict-FP node) above an AND-with-constant
  // whose other operand is an all-bits vector-compare result, so the op is
  // applied to the constant at compile time.
56957   // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56958   // to optimize away operation when it's from a constant.
56959   //
56960   // The general transformation is:
56961   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56962   //    AND(VECTOR_CMP(x,y), constant2)
56963   //    constant2 = UNARYOP(constant)
56964
56965   // Early exit if this isn't a vector operation, the operand of the
56966   // unary operation isn't a bitwise AND, or if the sizes of the operations
56967   // aren't the same.
56968   EVT VT = N->getValueType(0);
56969   bool IsStrict = N->isStrictFPOpcode();
56970   unsigned NumEltBits = VT.getScalarSizeInBits();
56971   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
        // ComputeNumSignBits == element width means every lane is 0 or -1,
        // i.e. the AND's first operand behaves as a lane select mask.
56972   if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
56973       DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
56974       VT.getSizeInBits() != Op0.getValueSizeInBits())
56975     return SDValue();
56976
56977   // Now check that the other operand of the AND is a constant. We could
56978   // make the transformation for non-constant splats as well, but it's unclear
56979   // that would be a benefit as it would not eliminate any operations, just
56980   // perform one more step in scalar code before moving to the vector unit.
56981   if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
56982     // Bail out if the vector isn't a constant.
56983     if (!BV->isConstant())
56984       return SDValue();
56985
56986     // Everything checks out. Build up the new and improved node.
56987     SDLoc DL(N);
56988     EVT IntVT = BV->getValueType(0);
56989     // Create a new constant of the appropriate type for the transformed
56990     // DAG.
56991     SDValue SourceConst;
56992     if (IsStrict)
56993       SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
56994                                 {N->getOperand(0), SDValue(BV, 0)});
56995     else
56996       SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
56997     // The AND node needs bitcasts to/from an integer vector type around it.
56998     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
56999     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57000                                  MaskConst);
57001     SDValue Res = DAG.getBitcast(VT, NewAnd);
57002     if (IsStrict)
          // Strict-FP nodes also produce a chain; forward the new op's chain.
57003       return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57004     return Res;
57005   }
57006
57007   return SDValue();
57008 }
57009
57010/// If we are converting a value to floating-point, try to replace scalar
57011/// truncate of an extracted vector element with a bitcast. This tries to keep
57012/// the sequence on XMM registers rather than moving between vector and GPRs.
57014   // TODO: This is currently only used by combineSIntToFP, but it is generalized
57015   // to allow being called by any similar cast opcode.
57016   // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57017   SDValue Trunc = N->getOperand(0);
57018   if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57019     return SDValue();
57020
  // Only handle extraction of element 0, so the bitcast keeps the same bits
  // in lane 0.
57021   SDValue ExtElt = Trunc.getOperand(0);
57022   if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57023       !isNullConstant(ExtElt.getOperand(1)))
57024     return SDValue();
57025
57026   EVT TruncVT = Trunc.getValueType();
57027   EVT SrcVT = ExtElt.getValueType();
57028   unsigned DestWidth = TruncVT.getSizeInBits();
57029   unsigned SrcWidth = SrcVT.getSizeInBits();
  // The narrow type must evenly tile the wide element for the bitcast to be
  // a pure reinterpretation.
57030   if (SrcWidth % DestWidth != 0)
57031     return SDValue();
57032
57033   // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
57034   EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57035   unsigned VecWidth = SrcVecVT.getSizeInBits();
57036   unsigned NumElts = VecWidth / DestWidth;
57037   EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57038   SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57039   SDLoc DL(N);
57040   SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57041                                   BitcastVec, ExtElt.getOperand(1));
57042   return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57043 }
57044
57046                                const X86Subtarget &Subtarget) {
  // Combine for (STRICT_)UINT_TO_FP: widen narrow integer sources and/or turn
  // the conversion into the better-supported SINT_TO_FP when the sign bit is
  // provably zero.
57047   bool IsStrict = N->isStrictFPOpcode();
57048   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57049   EVT VT = N->getValueType(0);
57050   EVT InVT = Op0.getValueType();
57051
57052   // Using i16 as an intermediate type is a bad idea, unless we have HW support
57053   // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
57054   // if hasFP16 support:
57055   //     UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57056   //     UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57057   // else
57058   //     UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57059   //     UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57060   if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57061     unsigned ScalarSize = InVT.getScalarSizeInBits();
        // Exactly-16/32/>=64-bit sources already have a usable form.
57062     if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57063         ScalarSize >= 64)
57064       return SDValue();
57065     SDLoc dl(N);
57066     EVT DstVT =
57068                        (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57069                        : ScalarSize < 32                       ? MVT::i32
57070                                                                : MVT::i64,
57071                        InVT.getVectorNumElements());
        // Zero-extension keeps the value non-negative, so SINT_TO_FP of the
        // widened value equals UINT_TO_FP of the original.
57072     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57073     if (IsStrict)
57074       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57075                          {N->getOperand(0), P});
57076     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57077   }
57078
57079   // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57080   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57081   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57082   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57083       VT.getScalarType() != MVT::f16) {
57084     SDLoc dl(N);
57085     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57086     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57087
57088     // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57089     if (IsStrict)
57090       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57091                          {N->getOperand(0), P});
57092     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57093   }
57094
57095   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57096   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57097   // the optimization here.
57098   SDNodeFlags Flags = N->getFlags();
57099   if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57100     if (IsStrict)
57101       return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57102                          {N->getOperand(0), Op0});
57103     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57104   }
57105
57106   return SDValue();
57107 }
57108
57111                                const X86Subtarget &Subtarget) {
  // Combine for (STRICT_)SINT_TO_FP: constant-hoist through compare-and-mask,
  // widen narrow sources, narrow wide sources with enough sign bits, and on
  // 32-bit x87 targets convert an i64 load + sint_to_fp into FILD.
57112   // First try to optimize away the conversion entirely when it's
57113   // conditionally from a constant. Vectors only.
57114   bool IsStrict = N->isStrictFPOpcode();
57116     return Res;
57117
57118   // Now move on to more general possibilities.
57119   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57120   EVT VT = N->getValueType(0);
57121   EVT InVT = Op0.getValueType();
57122
57123   // Using i16 as an intermediate type is a bad idea, unless we have HW support
57124   // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
57125   // if hasFP16 support:
57126   //     SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57127   //     SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57128   // else
57129   //     SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57130   //     SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57131   if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57132     unsigned ScalarSize = InVT.getScalarSizeInBits();
57133     if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57134         ScalarSize >= 64)
57135       return SDValue();
57136     SDLoc dl(N);
57137     EVT DstVT =
57139                        (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57140                        : ScalarSize < 32                       ? MVT::i32
57141                                                                : MVT::i64,
57142                        InVT.getVectorNumElements());
        // Sign-extension preserves the signed value exactly.
57143     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57144     if (IsStrict)
57145       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57146                          {N->getOperand(0), P});
57147     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57148   }
57149
57150   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57151   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57152   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57153   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57154       VT.getScalarType() != MVT::f16) {
57155     SDLoc dl(N);
57156     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57157     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57158     if (IsStrict)
57159       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57160                          {N->getOperand(0), P});
57161     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57162   }
57163
57164   // Without AVX512DQ we only support i64 to float scalar conversion. For both
57165   // vectors and scalars, see if we know that the upper bits are all the sign
57166   // bit, in which case we can truncate the input to i32 and convert from that.
57167   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57168     unsigned BitWidth = InVT.getScalarSizeInBits();
57169     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
        // >= BitWidth-31 sign bits means the value fits in i32.
57170     if (NumSignBits >= (BitWidth - 31)) {
57171       EVT TruncVT = MVT::i32;
57172       if (InVT.isVector())
57173         TruncVT = InVT.changeVectorElementType(TruncVT);
57174       SDLoc dl(N);
57175       if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57176         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57177         if (IsStrict)
57178           return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57179                              {N->getOperand(0), Trunc});
57180         return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57181       }
57182       // If we're after legalize and the type is v2i32 we need to shuffle and
57183       // use CVTSI2P.
57184       assert(InVT == MVT::v2i64 && "Unexpected VT!");
        // Pack the two low i32 halves into lanes 0 and 1 for CVTSI2P.
57185       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57186       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57187                                           { 0, 2, -1, -1 });
57188       if (IsStrict)
57189         return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57190                            {N->getOperand(0), Shuf});
57191       return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57192     }
57193   }
57194
57195   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57196   // a 32-bit target where SSE doesn't support i64->FP operations.
57197   if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57198       Op0.getOpcode() == ISD::LOAD) {
57199     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57200
57201     // This transformation is not supported if the result type is f16 or f128.
57202     if (VT == MVT::f16 || VT == MVT::f128)
57203       return SDValue();
57204
57205     // If we have AVX512DQ we can use packed conversion instructions unless
57206     // the VT is f80.
57207     if (Subtarget.hasDQI() && VT != MVT::f80)
57208       return SDValue();
57209
57210     if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57211         Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
          // BuildFILD returns {result, chain}; rewire the load's chain users
          // to the FILD chain before handing back the value.
57212       std::pair<SDValue, SDValue> Tmp =
57213           Subtarget.getTargetLowering()->BuildFILD(
57214               VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57215               Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57216       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57217       return Tmp.first;
57218     }
57219   }
57220
  // The extract-element fold below is not chain-aware, so skip it for
  // strict-FP nodes.
57221   if (IsStrict)
57222     return SDValue();
57223
57224   if (SDValue V = combineToFPTruncExtElt(N, DAG))
57225     return V;
57226
57227   return SDValue();
57228 }
57229
57231                                const X86Subtarget &Subtarget) {
  // lrint(frint(x)) -> lrint(x): with SSE2, the lrint lowering already
  // rounds using the current rounding mode, so the inner FRINT is redundant
  // for i32 results.  Only fires when the FRINT has no other users.
57232   EVT VT = N->getValueType(0);
57233   SDValue Src = N->getOperand(0);
57234   if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57235       VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57236     return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57237
57238   return SDValue();
57239 }
57240
57241// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57243                                     const X86Subtarget &Subtarget) {
  // AVX10.2-only: lower v2f32 -> v2i64 FP_TO_{S,U}INT_SAT by widening the
  // source to v4f32 (upper half undef) so the 128-bit saturating convert
  // instructions can be used.
57244   if (!Subtarget.hasAVX10_2())
57245     return SDValue();
57246
57247   bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57248   EVT SrcVT = N->getOperand(0).getValueType();
57249   EVT DstVT = N->getValueType(0);
57250   SDLoc dl(N);
57251
57252   if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57253     SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57254
57255     // Concatenate the original v2f32 input and V2F32Value to create v4f32
57256     SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57257                                  N->getOperand(0), V2F32Value);
57258
57259     // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57260     if (IsSigned)
57261       return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57262
57263     return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57264   }
57265   return SDValue();
57266 }
57267
57269   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
  // Returns true if any user of this EFLAGS value reads a condition that
  // depends on CF or OF (or a flag we can't classify), i.e. the producer's
  // carry/overflow behavior must be preserved.  Conservatively true for
  // unrecognized users.
57270
57271   for (const SDNode *User : Flags->users()) {
57272     X86::CondCode CC;
57273     switch (User->getOpcode()) {
57274     default:
57275       // Be conservative.
57276       return true;
57277     case X86ISD::SETCC:
        // NOTE(review): a second case label (presumably X86ISD::SETCC_CARRY)
        // is not visible in this listing — confirm against the full source.
57279       CC = (X86::CondCode)User->getConstantOperandVal(0);
57280       break;
57281     case X86ISD::BRCOND:
57282     case X86ISD::CMOV:
57283       CC = (X86::CondCode)User->getConstantOperandVal(2);
57284       break;
57285     }
57286
      // These condition codes consult CF (A/AE/B/BE), OF (O/NO), or both via
      // the signed comparisons (G/GE/L/LE).
57287     switch (CC) {
57288       // clang-format off
57289     default: break;
57290     case X86::COND_A: case X86::COND_AE:
57291     case X86::COND_B: case X86::COND_BE:
57292     case X86::COND_O: case X86::COND_NO:
57293     case X86::COND_G: case X86::COND_GE:
57294     case X86::COND_L: case X86::COND_LE:
57295       return true;
57296       // clang-format on
57297     }
57298   }
57299
57300   return false;
57301 }
57302
57303 static bool onlyZeroFlagUsed(SDValue Flags) {
  // Returns true if every user of this EFLAGS value only tests the Z flag
  // (COND_E / COND_NE).  Conservatively false for unrecognized users.
57304   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57305
57306   for (const SDNode *User : Flags->users()) {
        // CCOpNo is the operand index holding the condition code for this
        // user's node kind.
57307     unsigned CCOpNo;
57308     switch (User->getOpcode()) {
57309     default:
57310       // Be conservative.
57311       return false;
57312     case X86ISD::SETCC:
        // NOTE(review): a second case label (presumably X86ISD::SETCC_CARRY)
        // is not visible in this listing — confirm against the full source.
57314       CCOpNo = 0;
57315       break;
57316     case X86ISD::BRCOND:
57317     case X86ISD::CMOV:
57318       CCOpNo = 2;
57319       break;
57320     }
57321
57322     X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57323     if (CC != X86::COND_E && CC != X86::COND_NE)
57324       return false;
57325   }
57326
57327   return true;
57328 }
57329
57332 const X86Subtarget &Subtarget) {
57333 // Only handle test patterns.
57334 if (!isNullConstant(N->getOperand(1)))
57335 return SDValue();
57336
57337 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57338 // and use its flags directly.
57339 // TODO: Maybe we should try promoting compares that only use the zero flag
57340 // first if we can prove the upper bits with computeKnownBits?
57341 SDLoc dl(N);
57342 SDValue Op = N->getOperand(0);
57343 EVT VT = Op.getValueType();
57344 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57345
57346 if (SDValue CMP =
57347 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57348 return CMP;
57349
57350 // If we have a constant logical shift that's only used in a comparison
57351 // against zero turn it into an equivalent AND. This allows turning it into
57352 // a TEST instruction later.
57353 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57354 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57355 onlyZeroFlagUsed(SDValue(N, 0))) {
57356 unsigned BitWidth = VT.getSizeInBits();
57357 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57358 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57359 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57360 APInt Mask = Op.getOpcode() == ISD::SRL
57361 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57362 : APInt::getLowBitsSet(BitWidth, MaskBits);
57363 if (Mask.isSignedIntN(32)) {
57364 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57365 DAG.getConstant(Mask, dl, VT));
57366 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57367 DAG.getConstant(0, dl, VT));
57368 }
57369 }
57370 }
57371
57372 // If we're extracting from a avx512 bool vector and comparing against zero,
57373 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57374 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57375 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57376 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57377 SDValue Src = Op.getOperand(0);
57378 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57379 isNullConstant(Src.getOperand(1)) &&
57380 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57381 SDValue BoolVec = Src.getOperand(0);
57382 unsigned ShAmt = 0;
57383 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57384 ShAmt = BoolVec.getConstantOperandVal(1);
57385 BoolVec = BoolVec.getOperand(0);
57386 }
57387 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57388 EVT VecVT = BoolVec.getValueType();
57389 unsigned BitWidth = VecVT.getVectorNumElements();
57390 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57391 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57392 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57393 Op = DAG.getBitcast(BCVT, BoolVec);
57394 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57395 DAG.getConstant(Mask, dl, BCVT));
57396 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57397 DAG.getConstant(0, dl, BCVT));
57398 }
57399 }
57400 }
57401
57402 // Peek through any zero-extend if we're only testing for a zero result.
57403 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57404 SDValue Src = Op.getOperand(0);
57405 EVT SrcVT = Src.getValueType();
57406 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57407 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57408 DAG.getConstant(0, dl, SrcVT));
57409 }
57410
57411 // Look for a truncate.
57412 if (Op.getOpcode() != ISD::TRUNCATE)
57413 return SDValue();
57414
57415 SDValue Trunc = Op;
57416 Op = Op.getOperand(0);
57417
57418 // See if we can compare with zero against the truncation source,
57419 // which should help using the Z flag from many ops. Only do this for
57420 // i32 truncated op to prevent partial-reg compares of promoted ops.
57421 EVT OpVT = Op.getValueType();
57422 APInt UpperBits =
57424 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57425 onlyZeroFlagUsed(SDValue(N, 0))) {
57426 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57427 DAG.getConstant(0, dl, OpVT));
57428 }
57429
57430 // After this the truncate and arithmetic op must have a single use.
57431 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57432 return SDValue();
57433
57434 unsigned NewOpc;
57435 switch (Op.getOpcode()) {
57436 default: return SDValue();
57437 case ISD::AND:
57438 // Skip and with constant. We have special handling for and with immediate
57439 // during isel to generate test instructions.
57440 if (isa<ConstantSDNode>(Op.getOperand(1)))
57441 return SDValue();
57442 NewOpc = X86ISD::AND;
57443 break;
57444 case ISD::OR: NewOpc = X86ISD::OR; break;
57445 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57446 case ISD::ADD:
57447 // If the carry or overflow flag is used, we can't truncate.
57449 return SDValue();
57450 NewOpc = X86ISD::ADD;
57451 break;
57452 case ISD::SUB:
57453 // If the carry or overflow flag is used, we can't truncate.
57455 return SDValue();
57456 NewOpc = X86ISD::SUB;
57457 break;
57458 }
57459
57460 // We found an op we can narrow. Truncate its inputs.
57461 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57462 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57463
57464 // Use a X86 specific opcode to avoid DAG combine messing with it.
57465 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57466 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57467
57468 // For AND, keep a CMP so that we can match the test pattern.
57469 if (NewOpc == X86ISD::AND)
57470 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57471 DAG.getConstant(0, dl, VT));
57472
57473 // Return the flags.
57474 return Op.getValue(1);
57475}
57476
// NOTE(review): the opening line(s) of this definition (its name and leading
// parameters) were dropped by the extraction of this listing. From the assert
// below this is the DAG combine for flag-producing X86ISD::ADD/X86ISD::SUB
// nodes (value 0 = arithmetic result, value 1 = EFLAGS) — confirm the exact
// signature against upstream X86ISelLowering.cpp.
57479 const X86Subtarget &ST) {
57480 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57481 "Expected X86ISD::ADD or X86ISD::SUB");
57482
57483 SDLoc DL(N);
57484 SDValue LHS = N->getOperand(0);
57485 SDValue RHS = N->getOperand(1);
57486 MVT VT = LHS.getSimpleValueType();
57487 bool IsSub = X86ISD::SUB == N->getOpcode();
57488 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57489
// SUB x,1 whose arithmetic result is dead may fold into a compare that only
// produces EFLAGS.
57490 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57491 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57492 return CMP;
57493
57494 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57495 if (!N->hasAnyUseOfValue(1)) {
57496 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57497 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57498 }
57499
57500 // Fold any similar generic ADD/SUB opcodes to reuse this node.
// If an identical generic ISD::ADD/ISD::SUB already exists over the same
// operands, replace it with this node's value result so the arithmetic is
// computed only once. For SUB with swapped operands, the replacement must be
// negated: sub(a,b) == -(sub(b,a)).
57501 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57502 SDValue Ops[] = {N0, N1};
57503 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57504 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57505 SDValue Op(N, 0);
57506 if (Negate) {
57507 // Bail if this is only used by a user of the x86 add/sub.
57508 if (GenericAddSub->hasOneUse() &&
57509 GenericAddSub->user_begin()->isOnlyUserOf(N))
57510 return;
57511 Op = DAG.getNegative(Op, DL, VT);
57512 }
57513 DCI.CombineTo(GenericAddSub, Op);
57514 }
57515 };
57516 MatchGeneric(LHS, RHS, false);
57517 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57518
57519 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57520 // EFLAGS result doesn't change.
57521 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57522 /*ZeroSecondOpOnly*/ true);
57523}
57524
// NOTE(review): the signature line of this definition was dropped by the
// extraction. From the body it is the DAG combine for X86ISD::SBB
// (subtract-with-borrow: operands LHS, RHS, borrow-in) — confirm against
// upstream X86ISelLowering.cpp.
57526 SDValue LHS = N->getOperand(0);
57527 SDValue RHS = N->getOperand(1);
57528 SDValue BorrowIn = N->getOperand(2);
57529
// If the borrow-in can be simplified to a flags value produced through an
// ADD chain, rebuild the SBB with that simplified flags input.
57530 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57531 MVT VT = N->getSimpleValueType(0);
57532 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57533 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57534 }
57535
57536 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57537 // iff the flag result is dead.
57538 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57539 !N->hasAnyUseOfValue(1))
57540 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57541 LHS.getOperand(1), BorrowIn);
57542
57543 return SDValue();
57544}
57545
57546 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
// NOTE(review): the signature line(s) following this comment were dropped by
// the extraction — confirm the name/parameters against upstream
// X86ISelLowering.cpp. The body combines X86ISD::ADC (add-with-carry).
57549 SDValue LHS = N->getOperand(0);
57550 SDValue RHS = N->getOperand(1);
57551 SDValue CarryIn = N->getOperand(2);
57552 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57553 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57554
57555 // Canonicalize constant to RHS.
57556 if (LHSC && !RHSC)
57557 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57558 CarryIn);
57559
57560 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57561 // the result is either zero or one (depending on the input carry bit).
57562 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57563 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57564 // We don't have a good way to replace an EFLAGS use, so only do this when
57565 // dead right now.
57566 SDValue(N, 1).use_empty()) {
57567 SDLoc DL(N);
57568 EVT VT = N->getValueType(0);
57569 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57570 SDValue Res1 = DAG.getNode(
57571 ISD::AND, DL, VT,
// NOTE(review): line 57572 is missing from this listing — presumably the
// creation of the inner X86ISD::SETCC_CARRY node whose trailing operands
// appear on the next line. Confirm against upstream.
57573 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57574 DAG.getConstant(1, DL, VT));
57575 return DCI.CombineTo(N, Res1, CarryOut);
57576 }
57577
57578 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57579 // iff the flag result is dead.
57580 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57581 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57582 SDLoc DL(N);
57583 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57584 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57585 DAG.getConstant(0, DL, LHS.getValueType()),
57586 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57587 }
57588
// Simplify the carry-in the same way the SBB combine does for its borrow.
57589 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57590 MVT VT = N->getSimpleValueType(0);
57591 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57592 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57593 }
57594
57595 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57596 // iff the flag result is dead.
57597 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57598 !N->hasAnyUseOfValue(1))
57599 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57600 LHS.getOperand(1), CarryIn);
57601
57602 return SDValue();
57603}
57604
// NOTE(review): the first signature line of this definition was dropped by
// the extraction; from the body this is matchPMADDWD — it rewrites an
// add-of-build_vectors that pairs even/odd extracts of a widened vXi32 MUL
// into X86ISD::VPMADDWD. Confirm against upstream X86ISelLowering.cpp.
57606 const SDLoc &DL, EVT VT,
57607 const X86Subtarget &Subtarget) {
57608 using namespace SDPatternMatch;
57609
57610 // Example of pattern we try to detect:
57611 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57612 //(add (build_vector (extract_elt t, 0),
57613 // (extract_elt t, 2),
57614 // (extract_elt t, 4),
57615 // (extract_elt t, 6)),
57616 // (build_vector (extract_elt t, 1),
57617 // (extract_elt t, 3),
57618 // (extract_elt t, 5),
57619 // (extract_elt t, 7)))
57620
57621 if (!Subtarget.hasSSE2())
57622 return SDValue();
57623
// NOTE(review): line 57626 is missing from this listing — presumably the
// final clause of this legality check (likely a power-of-two element-count
// test). Confirm against upstream.
57624 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57625 VT.getVectorNumElements() < 4 ||
57627 return SDValue();
57628
// Match either (add BV0, BV1) or (add BV0, (add Accum, BV1)) so an existing
// accumulator can be re-added after the VPMADDWD is formed.
57629 SDValue Op0, Op1, Accum;
57630 if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
57631 m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))) &&
57632 !sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
57633 m_Add(m_Value(Accum), m_AllOf(m_Opc(ISD::BUILD_VECTOR),
57634 m_Value(Op1))))))
57635 return SDValue();
57636
57637 // Check if one of Op0,Op1 is of the form:
57638 // (build_vector (extract_elt Mul, 0),
57639 // (extract_elt Mul, 2),
57640 // (extract_elt Mul, 4),
57641 // ...
57642 // the other is of the form:
57643 // (build_vector (extract_elt Mul, 1),
57644 // (extract_elt Mul, 3),
57645 // (extract_elt Mul, 5),
57646 // ...
57647 // and identify Mul.
57648 SDValue Mul;
57649 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57650 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57651 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57652 // TODO: Be more tolerant to undefs.
57653 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57654 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57655 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57656 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57657 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57658 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57659 return SDValue();
57660 // Commutativity of mul allows factors of a product to reorder.
57661 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57662 std::swap(Idx0L, Idx1L);
57663 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57664 std::swap(Idx0H, Idx1H);
57665 // Commutativity of add allows pairs of factors to reorder.
57666 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57667 std::swap(Idx0L, Idx0H);
57668 std::swap(Idx1L, Idx1H);
57669 }
// After canonicalization the four extract indices must be exactly the
// consecutive run 2i, 2i+1, 2i+2, 2i+3.
57670 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57671 Idx1H != 2 * i + 3)
57672 return SDValue();
57673 if (!Mul) {
57674 // First time an extract_elt's source vector is visited. Must be a MUL
57675 // with 2X number of vector elements than the BUILD_VECTOR.
57676 // Both extracts must be from same MUL.
57677 Mul = Vec0L;
57678 if (Mul.getOpcode() != ISD::MUL ||
57679 Mul.getValueType().getVectorNumElements() != 2 * e)
57680 return SDValue();
57681 }
57682 // Check that the extract is from the same MUL previously seen.
57683 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57684 return SDValue();
57685 }
57686
57687 // Check if the Mul source can be safely shrunk.
57688 ShrinkMode Mode;
57689 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57690 Mode == ShrinkMode::MULU16)
57691 return SDValue();
57692
// Narrow both multiplicands to vXi16; VPMADDWD multiplies i16 pairs and
// accumulates adjacent products into i32 lanes.
57693 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57694 VT.getVectorNumElements() * 2);
57695 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57696 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57697
57698 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57699 ArrayRef<SDValue> Ops) {
57700 EVT InVT = Ops[0].getValueType();
57701 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57702 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57703 InVT.getVectorNumElements() / 2);
57704 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57705 };
57706 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57707 if (Accum)
57708 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57709 return R;
57710}
57711
57712 // Attempt to turn this pattern into PMADDWD.
57713 // (add (mul (sext (build_vector)), (sext (build_vector))),
57714 // (mul (sext (build_vector)), (sext (build_vector)))
// NOTE(review): the first signature line of this definition was dropped by
// the extraction; from the comment above this is matchPMADDWD_2 — confirm
// against upstream X86ISelLowering.cpp.
57716 const SDLoc &DL, EVT VT,
57717 const X86Subtarget &Subtarget) {
57718 using namespace SDPatternMatch;
57719
57720 if (!Subtarget.hasSSE2())
57721 return SDValue();
57722
// NOTE(review): line 57725 is missing from this listing — presumably the
// final clause of this legality check. Confirm against upstream.
57723 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57724 VT.getVectorNumElements() < 4 ||
57726 return SDValue();
57727
57728 // All inputs need to be sign extends.
57729 // TODO: Support ZERO_EXTEND from known positive?
57730 SDValue N00, N01, N10, N11;
57731 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57732 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57733 return SDValue();
57734
57735 // Must be extending from vXi16.
57736 EVT InVT = N00.getValueType();
57737 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57738 N10.getValueType() != InVT || N11.getValueType() != InVT)
57739 return SDValue();
57740
57741 // All inputs should be build_vectors.
// NOTE(review): line 57745 is missing from this listing — presumably the
// N11 BUILD_VECTOR opcode check matching the three above. Confirm upstream.
57742 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57743 N01.getOpcode() != ISD::BUILD_VECTOR ||
57744 N10.getOpcode() != ISD::BUILD_VECTOR ||
57746 return SDValue();
57747
57748 // For each element, we need to ensure we have an odd element from one vector
57749 // multiplied by the odd element of another vector and the even element from
57750 // one of the same vectors being multiplied by the even element from the
57751 // other vector. So we need to make sure for each element i, this operator
57752 // is being performed:
57753 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57754 SDValue In0, In1;
57755 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57756 SDValue N00Elt = N00.getOperand(i);
57757 SDValue N01Elt = N01.getOperand(i);
57758 SDValue N10Elt = N10.getOperand(i);
57759 SDValue N11Elt = N11.getOperand(i);
57760 // TODO: Be more tolerant to undefs.
57761 SDValue N00In, N01In, N10In, N11In;
57762 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57763 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57764 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57765 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57766 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57767 return SDValue();
57768 // Add is commutative so indices can be reordered.
57769 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57770 std::swap(IdxN00, IdxN10);
57771 std::swap(IdxN01, IdxN11);
57772 }
57773 // N0 indices be the even element. N1 indices must be the next odd element.
57774 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57775 IdxN11 != 2 * i + 1)
57776 return SDValue();
57777
57778 // First time we find an input capture it.
57779 if (!In0) {
57780 In0 = N00In;
57781 In1 = N01In;
57782
57783 // The input vectors must be at least as wide as the output.
57784 // If they are larger than the output, we extract subvector below.
57785 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57786 In1.getValueSizeInBits() < VT.getSizeInBits())
57787 return SDValue();
57788 }
57789 // Mul is commutative so the input vectors can be in any order.
57790 // Canonicalize to make the compares easier.
57791 if (In0 != N00In)
57792 std::swap(N00In, N01In);
57793 if (In0 != N10In)
57794 std::swap(N10In, N11In);
57795 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57796 return SDValue();
57797 }
57798
57799 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57800 ArrayRef<SDValue> Ops) {
57801 EVT OpVT = Ops[0].getValueType();
57802 assert(OpVT.getScalarType() == MVT::i16 &&
57803 "Unexpected scalar element type");
57804 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57805 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57806 OpVT.getVectorNumElements() / 2);
57807 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57808 };
57809
57810 // If the output is narrower than an input, extract the low part of the input
57811 // vector.
57812 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57813 VT.getVectorNumElements() * 2);
57814 if (OutVT16.bitsLT(In0.getValueType())) {
57815 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57816 DAG.getVectorIdxConstant(0, DL));
57817 }
57818 if (OutVT16.bitsLT(In1.getValueType())) {
57819 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57820 DAG.getVectorIdxConstant(0, DL));
57821 }
57822 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57823 PMADDBuilder);
57824}
57825
57826 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57827 // If upper element in each pair of both VPMADDWD are zero then we can merge
57828 // the operand elements and use the implicit add of VPMADDWD.
57829 // TODO: Add support for VPMADDUBSW (which isn't commutable).
// NOTE(review): the first signature line of this definition was dropped by
// the extraction; from the comment above this is combineAddOfPMADDWD —
// confirm against upstream X86ISelLowering.cpp.
57831 const SDLoc &DL, EVT VT) {
57832 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57833 return SDValue();
57834
57835 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57836 if (VT.getSizeInBits() > 128)
57837 return SDValue();
57838
// NOTE(review): line 57841 is missing from this listing — presumably the
// declaration of DemandedBits used in the MaskedValueIsZero calls below.
// Confirm against upstream.
57839 unsigned NumElts = VT.getVectorNumElements();
57840 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57842 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57843
// A VPMADDWD is a plain pairwise multiply if the high i16 lane of each pair
// is zero in either multiplicand — the implicit add then adds zero.
57844 bool Op0HiZero =
57845 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57846 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57847 bool Op1HiZero =
57848 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57849 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57850
57851 // TODO: Check for zero lower elements once we have actual codegen that
57852 // creates them.
57853 if (!Op0HiZero || !Op1HiZero)
57854 return SDValue();
57855
57856 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57857 SmallVector<int> Mask;
57858 for (int i = 0; i != (int)NumElts; ++i) {
57859 Mask.push_back(2 * i);
57860 Mask.push_back(2 * (i + NumElts));
57861 }
57862
57863 SDValue LHS =
57864 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57865 SDValue RHS =
57866 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57867 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57868}
57869
57870 /// CMOV of constants requires materializing constant operands in registers.
57871 /// Try to fold those constants into an 'add' instruction to reduce instruction
57872 /// count. We do this with CMOV rather the generic 'select' because there are
57873 /// earlier folds that may be used to turn select-of-constants into logic hacks.
// NOTE(review): the first signature line of this definition was dropped by
// the extraction — confirm the name/parameters against upstream
// X86ISelLowering.cpp.
57875 SelectionDAG &DAG,
57876 const X86Subtarget &Subtarget) {
57877 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57878 // better because we eliminate 1-2 instructions. This transform is still
57879 // an improvement without zero operands because we trade 2 move constants and
57880 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57881 // immediate asm operands (fit in 32-bits).
57882 auto isSuitableCmov = [](SDValue V) {
57883 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57884 return false;
57885 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57886 !isa<ConstantSDNode>(V.getOperand(1)))
57887 return false;
// Accept only constants encodable as x86 immediates (signed 32-bit), or a
// zero operand (which disappears entirely after the fold).
57888 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57889 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57890 V.getConstantOperandAPInt(1).isSignedIntN(32));
57891 };
57892
57893 // Match an appropriate CMOV as the first operand of the add.
57894 SDValue Cmov = N->getOperand(0);
57895 SDValue OtherOp = N->getOperand(1);
57896 if (!isSuitableCmov(Cmov))
57897 std::swap(Cmov, OtherOp);
57898 if (!isSuitableCmov(Cmov))
57899 return SDValue();
57900
57901 // Don't remove a load folding opportunity for the add. That would neutralize
57902 // any improvements from removing constant materializations.
57903 if (X86::mayFoldLoad(OtherOp, Subtarget))
57904 return SDValue();
57905
57906 EVT VT = N->getValueType(0);
57907 SDValue FalseOp = Cmov.getOperand(0);
57908 SDValue TrueOp = Cmov.getOperand(1);
57909
57910 // We will push the add through the select, but we can potentially do better
57911 // if we know there is another add in the sequence and this is pointer math.
57912 // In that case, we can absorb an add into the trailing memory op and avoid
57913 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57914 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
// Only take the pointer-math path when every user of this add is a memory
// node addressing through it.
57915 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57916 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57917 all_of(N->users(), [&](SDNode *Use) {
57918 auto *MemNode = dyn_cast<MemSDNode>(Use);
57919 return MemNode && MemNode->getBasePtr().getNode() == N;
57920 })) {
57921 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57922 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
57923 // it is possible that choosing op1 might be better.
57924 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
57925 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
57926 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
57927 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
57928 Cmov.getOperand(2), Cmov.getOperand(3));
57929 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
57930 }
57931
57932 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57933 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
57934 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
57935 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
57936 Cmov.getOperand(3));
57937}
57938
// NOTE(review): the opening line(s) of this definition were dropped by the
// extraction; from the body this is the ISD::ADD DAG combine (combineAdd) —
// confirm the exact signature against upstream X86ISelLowering.cpp.
57941 const X86Subtarget &Subtarget) {
57942 using namespace SDPatternMatch;
57943 EVT VT = N->getValueType(0);
57944 SDValue Op0 = N->getOperand(0);
57945 SDValue Op1 = N->getOperand(1);
57946 SDLoc DL(N);
57947
57948 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
57949 return Select;
57950
// Try the PMADDWD formation patterns before generic horizontal-op matching.
57951 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
57952 return MAdd;
57953 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
57954 return MAdd;
57955 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
57956 return MAdd;
57957
57958 // Try to synthesize horizontal adds from adds of shuffles.
57959 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57960 return V;
57961
57962 // Canonicalize hidden LEA pattern:
57963 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
57964 // iff c < 4
// c < 4 keeps the shift encodable as an LEA scale (1/2/4/8).
57965 if (VT == MVT::i32 || VT == MVT::i64) {
57966 SDValue Y, Z, Shift;
57967 APInt Amt;
57968 if (sd_match(
57969 N, m_Add(m_OneUse(m_Sub(m_AllOf(m_Value(Shift),
57970 m_Shl(m_Value(), m_ConstInt(Amt))),
57971 m_Value(Y))),
57972 m_Value(Z))) &&
57973 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
57974 return DAG.getNode(ISD::SUB, DL, VT,
57975 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
57976 }
57977 }
57978
57979 SDValue X, Y;
57980
57981 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
57982 // iff X and Y won't overflow.
57983 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
57984 sd_match(Op1, m_c_BinOp(X86ISD::PSADBW, m_Value(Y), m_Zero())) &&
57985 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
57986 MVT OpVT = X.getSimpleValueType();
57987 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
57988 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
57989 getZeroVector(OpVT, Subtarget, DAG, DL));
57990 }
57991
// NOTE(review): line 57994 is missing from this listing — presumably the
// trailing argument(s) of this getVectorVT call (the element count).
// Confirm against upstream.
57992 if (VT.isVector()) {
57993 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
57995
57996 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
57997 // (sub Y, (sext (vXi1 X))).
57998 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
57999 // in generic DAG combine without a legal type check, but adding this there
58000 // caused regressions.
58001 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58002 sd_match(N, m_Add(m_ZExt(m_AllOf(m_SpecificVT(BoolVT), m_Value(X))),
58003 m_Value(Y)))) {
58004 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58005 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58006 }
58007
58008 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58009 // canonicalisation as we don't have good vXi8 shifts.
58010 if (VT.getScalarType() == MVT::i8 &&
58011 sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
58012 SDValue Cmp =
58013 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58014 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58015 }
58016 }
58017
58018 // Peephole for 512-bit VPDPBSSD on non-VLX targets.
58019 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
// NOTE(review): lines 58024 and 58026 are missing from this listing —
// presumably the two inner m_Node(...) opcode patterns whose trailing
// operands appear below. Confirm against upstream.
58020 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58021 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58022 if (sd_match(N, m_Add(m_Value(Accum),
58023 m_Node(ISD::CONCAT_VECTORS,
58025 m_Value(Lo1)),
58027 m_Value(Hi1)))))) {
58028 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58029 concatSubVectors(Lo0, Hi0, DAG, DL),
58030 concatSubVectors(Lo1, Hi1, DAG, DL));
58031 }
58032 }
58033
58034 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58035 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58036 X86::isZeroNode(Op0.getOperand(1))) {
58037 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58038 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58039 Op0.getOperand(0), Op0.getOperand(2));
58040 }
58041
58042 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58043}
58044
58045 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58046 // condition comes from the subtract node that produced -X. This matches the
58047 // cmov expansion for absolute value. By swapping the operands we convert abs
58048 // to nabs.
58049 static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58050 SelectionDAG &DAG) {
58051 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58052 return SDValue();
58053
// The CMOV's flags operand must come from the X86ISD::SUB that computed the
// negation/difference (result 1 of that node is EFLAGS).
58054 SDValue Cond = N1.getOperand(3);
58055 if (Cond.getOpcode() != X86ISD::SUB)
58056 return SDValue();
58057 assert(Cond.getResNo() == 1 && "Unexpected result number");
58058
// NOTE(review): line 58061 is missing from this listing — presumably the
// declaration of CC, extracted from the CMOV's condition-code operand.
// Confirm against upstream.
58059 SDValue FalseOp = N1.getOperand(0);
58060 SDValue TrueOp = N1.getOperand(1);
58062
58063 // ABS condition should come from a negate operation.
58064 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58065 isNullConstant(Cond.getOperand(0))) {
58066 // Get the X and -X from the negate.
58067 SDValue NegX = Cond.getValue(0);
58068 SDValue X = Cond.getOperand(1);
58069
58070 // Cmov operands should be X and NegX. Order doesn't matter.
58071 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58072 return SDValue();
58073
58074 // Build a new CMOV with the operands swapped.
58075 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58076 N1.getOperand(2), Cond);
58077 // Convert sub to add.
58078 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58079 }
58080
58081 // Handle ABD special case:
58082 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58083 // ABD condition should come from a pair of matching subtracts.
58084 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58085 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58086 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58087 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58088 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58089 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58090 // Build a new CMOV with the operands swapped.
58091 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58092 Cond);
58093 }
58094
58095 return SDValue();
58096}
58097
// NOTE(review): the signature line of this definition was dropped by the
// extraction; from the body it combines (sub C, (zext setcc)) — presumably
// combineSubSetcc. Confirm against upstream X86ISelLowering.cpp.
58099 SDValue Op0 = N->getOperand(0);
58100 SDValue Op1 = N->getOperand(1);
58101
58102 // (sub C (zero_extend (setcc)))
58103 // =>
58104 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
58105 // Don't disturb (sub 0 setcc), which is easily done with neg.
58106 EVT VT = N->getValueType(0);
58107 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58108 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58109 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58110 Op1.getOperand(0).hasOneUse()) {
// NOTE(review): lines 58112-58113 are missing from this listing —
// presumably the computation of NewCC (the inverted condition code) used
// below. Confirm against upstream.
58111 SDValue SetCC = Op1.getOperand(0);
58114 APInt NewImm = Op0C->getAPIntValue() - 1;
58115 SDLoc DL(Op1);
58116 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58117 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
// Use the flag-producing X86ISD::ADD form so the node replaces the sub in
// place; only the value result (0) is consumed.
58118 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58119 DAG.getConstant(NewImm, DL, VT));
58120 }
58121
58122 return SDValue();
58123}
58124
// NOTE(review): the signature line of this definition was dropped by the
// extraction; from the body it combines the X86 conditional-load/store
// memory intrinsic nodes (CLOAD/CSTORE: operand 3 = condition code,
// operand 4 = EFLAGS). Confirm against upstream X86ISelLowering.cpp.
58126 if (N->getConstantOperandVal(3) != X86::COND_NE)
58127 return SDValue();
58128
// The flags must come from (sub 0, X), i.e. a NEG used only for EFLAGS.
58129 SDValue Sub = N->getOperand(4);
58130 if (Sub.getOpcode() != X86ISD::SUB)
58131 return SDValue();
58132
58133 SDValue Op1 = Sub.getOperand(1);
58134
58135 if (!X86::isZeroNode(Sub.getOperand(0)))
58136 return SDValue();
58137
58138 SDLoc DL(N);
58139 SmallVector<SDValue, 5> Ops(N->op_values());
58140 if (Op1.getOpcode() == X86ISD::SETCC) {
58141 // res, flags2 = sub 0, (setcc cc, flag)
58142 // cload/cstore ..., cond_ne, flag2
58143 // ->
58144 // cload/cstore cc, flag
58145 Ops[3] = Op1.getOperand(0);
58146 Ops[4] = Op1.getOperand(1);
58147 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58148 SDValue Src = Op1;
58149 SDValue Op10 = Op1.getOperand(0);
58150 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58151 // res, flags2 = sub 0, (and (xor X, -1), Y)
58152 // cload/cstore ..., cond_ne, flag2
58153 // ->
58154 // res, flags2 = sub 0, (and X, Y)
58155 // cload/cstore ..., cond_e, flag2
58156 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58157 Op1.getOperand(1));
58158 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58159 }
58160 // res, flags2 = sub 0, (and X, Y)
58161 // cload/cstore ..., cc, flag2
58162 // ->
58163 // res, flags2 = cmp (and X, Y), 0
58164 // cload/cstore ..., cc, flag2
58165 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58166 } else {
58167 return SDValue();
58168 }
58169
// Rebuild the memory intrinsic with the rewritten condition/flags operands,
// preserving the original memory VT and operand.
58170 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58171 cast<MemSDNode>(N)->getMemoryVT(),
58172 cast<MemSDNode>(N)->getMemOperand());
58173}
58174
// NOTE(review): the opening line(s) of this definition were dropped by the
// extraction; from the body this is the ISD::SUB DAG combine (combineSub) —
// confirm the exact signature against upstream X86ISelLowering.cpp.
58177 const X86Subtarget &Subtarget) {
58178 EVT VT = N->getValueType(0);
58179 SDValue Op0 = N->getOperand(0);
58180 SDValue Op1 = N->getOperand(1);
58181 SDLoc DL(N);
58182
// NOTE(review): line 58184 is missing from this listing — presumably the
// first line of this lambda's return expression (a constant-classification
// call whose trailing argument appears below). Confirm against upstream.
58183 auto IsNonOpaqueConstant = [&](SDValue Op) {
58185 /*AllowOpaques*/ false);
58186 };
58187
58188 // X86 can't encode an immediate LHS of a sub. See if we can push the
58189 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58190 // one use and a constant, invert the immediate, saving one register.
58191 // However, ignore cases where C1 is 0, as those will become a NEG.
58192 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
58193 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58194 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58195 Op1->hasOneUse()) {
58196 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58197 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58198 SDValue NewAdd =
58199 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58200 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58201 }
58202
58203 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58204 return V;
58205
58206 // Try to synthesize horizontal subs from subs of shuffles.
58207 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58208 return V;
58209
58210 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58211 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58212 X86::isZeroNode(Op1.getOperand(1))) {
58213 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58214 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58215 Op1.getOperand(0), Op1.getOperand(2));
58216 }
58217
58218 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58219 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58220 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58221 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58222 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58223 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58224 Op1.getOperand(1), Op1.getOperand(2));
58225 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58226 }
58227
58228 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58229 return V;
58230
58231 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58232 return V;
58233
58234 return combineSubSetcc(N, DAG);
58235}
58236
                           const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
         "Unknown PCMP opcode");

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  MVT VT = N->getSimpleValueType(0);
  unsigned EltBits = VT.getScalarSizeInBits();
  unsigned NumElts = VT.getVectorNumElements();
  SDLoc DL(N);

  // Identical operands: X == X is all-ones, X >s X is all-zeros.
  if (LHS == RHS)
    return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
                                      : DAG.getConstant(0, DL, VT);

  // Constant Folding.
  // PCMPEQ(X,UNDEF) -> UNDEF
  // PCMPGT(X,UNDEF) -> 0
  // PCMPGT(UNDEF,X) -> 0
  APInt LHSUndefs, RHSUndefs;
  SmallVector<APInt> LHSBits, RHSBits;
  if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
      getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
    APInt Ones = APInt::getAllOnes(EltBits);
    APInt Zero = APInt::getZero(EltBits);
    SmallVector<APInt> Results(NumElts);
    for (unsigned I = 0; I != NumElts; ++I) {
      if (Opcode == X86ISD::PCMPEQ) {
        Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
      } else {
        // PCMPGT is a signed compare; a lane with any undef input folds to 0.
        bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
        Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
      }
    }
    // Only PCMPEQ may propagate undef lanes (PCMPEQ(X,UNDEF) -> UNDEF);
    // undef PCMPGT lanes were already folded to 0 above.
    if (Opcode == X86ISD::PCMPEQ)
      return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
    return getConstVector(Results, VT, DAG, DL);
  }

  return SDValue();
}
58280
// Helper to determine if we can convert an integer comparison to a float
// comparison by casting the operands.
58283static std::optional<unsigned>
58284CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58285 unsigned NumSignificantBitsRHS) {
58286 MVT SVT = VT.getScalarType();
58287 assert(SVT == MVT::f32 && "Only tested for float so far");
58288 const fltSemantics &Sem = SVT.getFltSemantics();
58289 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58290 "Only PCMPEQ/PCMPGT currently supported");
58291
58292 // TODO: Handle bitcastable integers.
58293
58294 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58295 // a fp value.
58296 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58297 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58298 return ISD::SINT_TO_FP;
58299
58300 return std::nullopt;
58301}
58302
/// Helper that combines an array of subvector ops as if they were the operands
/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58308 const X86Subtarget &Subtarget,
58309 unsigned Depth) {
58310 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58311 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58312
58313 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58314 return DAG.getUNDEF(VT);
58315
58316 if (llvm::all_of(Ops, [](SDValue Op) {
58317 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58318 }))
58319 return getZeroVector(VT, Subtarget, DAG, DL);
58320
58322 return SDValue(); // Limit search depth.
58323
58324 SDValue Op0 = Ops[0];
58325 bool IsSplat = llvm::all_equal(Ops);
58326 unsigned NumOps = Ops.size();
58327 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58328 LLVMContext &Ctx = *DAG.getContext();
58329
58330 // Repeated subvectors.
58331 if (IsSplat &&
58332 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58333 // If this broadcast is inserted into both halves, use a larger broadcast.
58334 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58335 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58336
58337 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58338 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58339 (Subtarget.hasAVX2() ||
58341 VT.getScalarType(), Subtarget)))
58342 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58343 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58344 Op0.getOperand(0),
58345 DAG.getVectorIdxConstant(0, DL)));
58346
58347 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58348 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58349 (Subtarget.hasAVX2() ||
58350 (EltSizeInBits >= 32 &&
58351 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58352 Op0.getOperand(0).getValueType() == VT.getScalarType())
58353 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58354
58355 // concat_vectors(extract_subvector(splat(x)),
58356 // extract_subvector(splat(x))) -> splat(x)
58357 // concat_vectors(extract_subvector(subv_broadcast(x)),
58358 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58359 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58360 Op0.getOperand(0).getValueType() == VT) {
58361 SDValue SrcVec = Op0.getOperand(0);
58362 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58363 return SrcVec;
58364 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58365 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58366 return SrcVec;
58367 }
58368
58369 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58370 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58371 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58372 return DAG.getNode(Op0.getOpcode(), DL, VT,
58374 Op0.getOperand(0), Op0.getOperand(0)),
58375 Op0.getOperand(1));
58376 }
58377
58378 // TODO: This should go in combineX86ShufflesRecursively eventually.
58379 if (NumOps == 2) {
58380 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58381 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58382 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58384 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58385 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58386 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58387 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58388 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58389 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58390 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58391 // Only concat of subvector high halves which vperm2x128 is best at or if
58392 // it should fold into a subvector broadcast.
58393 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58394 SrcVT1.is256BitVector()) {
58395 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58396 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58397 "Bad subvector index");
58398 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58399 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58400 unsigned Index = 0;
58401 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58402 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58403 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58404 DAG.getBitcast(VT, Src0.getOperand(0)),
58405 DAG.getBitcast(VT, Src1.getOperand(0)),
58406 DAG.getTargetConstant(Index, DL, MVT::i8));
58407 }
58408 }
58409 // Widen extract_subvector
58410 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58411 // --> extract_subvector(x,lo)
58412 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58413 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58414 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58415 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58416 return DAG.getBitcast(VT,
58418 Src0.getConstantOperandVal(1),
58419 DAG, DL, VT.getSizeInBits()));
58420 }
58421 }
58422 }
58423
58424 // Repeated opcode.
58425 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58426 // but it currently struggles with different vector widths.
58427 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58428 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58429 })) {
58430 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58432 for (SDValue SubOp : SubOps)
58433 Subs.push_back(SubOp.getOperand(I));
58434 // Attempt to peek through bitcasts and concat the original subvectors.
58435 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58436 if (SubVT.isSimple() && SubVT.isVector()) {
58437 MVT ConcatVT =
58439 SubVT.getVectorElementCount() * Subs.size());
58440 for (SDValue &Sub : Subs)
58441 Sub = DAG.getBitcast(SubVT, Sub);
58442 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58443 Subtarget, Depth + 1))
58444 return DAG.getBitcast(VT, ConcatSrc);
58445 return DAG.getBitcast(
58446 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58447 }
58448 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58449 };
58450 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58451 bool AllConstants = true;
58452 bool AllSubs = true;
58453 unsigned VecSize = VT.getSizeInBits();
58454 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58455 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58456 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58457 }))
58458 return true;
58459 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58460 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58461 unsigned SubSize = BC.getValueSizeInBits();
58462 unsigned EltSize = BC.getScalarValueSizeInBits();
58463 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58465 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58466 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58467 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58468 }
58469 return AllConstants || AllSubs;
58470 };
58471 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58472 bool AllConstants = true;
58474 for (SDValue SubOp : SubOps) {
58475 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58476 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58478 Subs.push_back(SubOp.getOperand(I));
58479 }
58480 if (AllConstants)
58481 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58482 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58483 };
58484
58485 unsigned Opcode = Op0.getOpcode();
58486 switch (Opcode) {
58487 case ISD::BITCAST: {
58488 // TODO: Support AVX1/AVX2 bitcasts.
58490 for (SDValue SubOp : Ops)
58491 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58492 EVT InnerVT = SubOps[0].getValueType();
58493 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58494 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58495 (Subtarget.hasBWI() ||
58496 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58497 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58498 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58499 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58500 return Op.getValueType() == InnerVT;
58501 })) {
58502 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58503 MVT ConcatVT = MVT::getVectorVT(
58504 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58505 if (SDValue ConcatSrc = combineConcatVectorOps(
58506 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58507 return DAG.getBitcast(VT, ConcatSrc);
58508 }
58509 break;
58510 }
58511 case ISD::VECTOR_SHUFFLE: {
58512 // TODO: Generalize NumOps support.
58513 if (!IsSplat && NumOps == 2 &&
58514 ((VT.is256BitVector() &&
58515 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58516 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58517 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58518 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58519 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58520 if (Concat0 || Concat1 ||
58521 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58522 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58523 Subtarget.hasVBMI())) {
58524 int NumSubElts = Op0.getValueType().getVectorNumElements();
58525 SmallVector<int> NewMask;
58526 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58527 M = M >= NumSubElts ? M + NumSubElts : M;
58528 NewMask.push_back(M);
58529 }
58530 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58531 if (0 <= M)
58532 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58533 NewMask.push_back(M);
58534 }
58535 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58536 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58537 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58538 }
58539 }
58540 break;
58541 }
58542 case X86ISD::VBROADCAST: {
58543 // TODO: 512-bit VBROADCAST concatenation.
58544 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58545 return Op.getOperand(0).getValueType().is128BitVector();
58546 })) {
58547 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58548 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58549 ConcatSubOperand(VT, Ops, 0),
58550 ConcatSubOperand(VT, Ops, 0));
58551 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58552 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58553 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58555 DL, VT, ConcatSubOperand(VT, Ops, 0),
58556 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58557 }
58558 break;
58559 }
58560 case X86ISD::MOVDDUP:
58561 case X86ISD::MOVSHDUP:
58562 case X86ISD::MOVSLDUP: {
58563 if (!IsSplat && (VT.is256BitVector() ||
58564 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58565 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58566 break;
58567 }
58568 case X86ISD::SHUFP: {
58569 if (!IsSplat &&
58570 (VT == MVT::v8f32 ||
58571 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58572 llvm::all_of(Ops, [Op0](SDValue Op) {
58573 return Op.getOperand(2) == Op0.getOperand(2);
58574 })) {
58575 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58576 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58577 if (Concat0 || Concat1)
58578 return DAG.getNode(Opcode, DL, VT,
58579 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58580 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58581 Op0.getOperand(2));
58582 }
58583 break;
58584 }
58585 case X86ISD::UNPCKH:
58586 case X86ISD::UNPCKL: {
58587 // TODO: UNPCK should use CombineSubOperand
58588 // Don't concatenate build_vector patterns.
58589 if (!IsSplat &&
58590 ((VT.is256BitVector() &&
58591 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58592 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58593 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58594 none_of(Ops, [](SDValue Op) {
58595 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58597 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58599 })) {
58600 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58601 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58602 if (Concat0 || Concat1 ||
58603 (Subtarget.hasInt256() && EltSizeInBits == 64))
58604 return DAG.getNode(Opcode, DL, VT,
58605 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58606 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58607 }
58608 break;
58609 }
58610 case X86ISD::PSHUFHW:
58611 case X86ISD::PSHUFLW:
58612 case X86ISD::PSHUFD:
58613 if (!IsSplat &&
58614 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58615 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58616 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58617 llvm::all_of(Ops, [Op0](SDValue Op) {
58618 return Op.getOperand(1) == Op0.getOperand(1);
58619 })) {
58620 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58621 Op0.getOperand(1));
58622 }
58623 [[fallthrough]];
58624 case X86ISD::VPERMILPI:
58625 if (!IsSplat && EltSizeInBits == 32 &&
58626 (VT.is256BitVector() ||
58627 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58628 all_of(Ops, [&Op0](SDValue Op) {
58629 return Op0.getOperand(1) == Op.getOperand(1);
58630 })) {
58631 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58632 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58633 Res =
58634 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58635 return DAG.getBitcast(VT, Res);
58636 }
58637 break;
58638 case X86ISD::VPERMILPV:
58639 if (!IsSplat && (VT.is256BitVector() ||
58640 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58641 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58642 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58643 if (Concat0 || Concat1)
58644 return DAG.getNode(Opcode, DL, VT,
58645 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58646 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58647 }
58648 break;
58649 case X86ISD::PSHUFB:
58650 case X86ISD::PSADBW:
58651 case X86ISD::VPMADDUBSW:
58652 case X86ISD::VPMADDWD:
58653 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58654 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58655 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58656 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58657 NumOps * SrcVT.getVectorNumElements());
58658 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58659 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58660 if (Concat0 || Concat1)
58661 return DAG.getNode(
58662 Opcode, DL, VT,
58663 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58664 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58665 }
58666 break;
58667 case X86ISD::VPERMV:
58668 // TODO: Handle 256-bit and NumOps == 4 cases.
58669 if (!IsSplat && NumOps == 2 &&
58670 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58671 MVT OpVT = Op0.getSimpleValueType();
58672 int NumSrcElts = OpVT.getVectorNumElements();
58673 SmallVector<int, 64> ConcatMask;
58674 for (unsigned i = 0; i != NumOps; ++i) {
58675 SmallVector<int, 64> SubMask;
58677 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58678 break;
58679 for (int M : SubMask) {
58680 if (0 <= M)
58681 M += i * NumSrcElts;
58682 ConcatMask.push_back(M);
58683 }
58684 }
58685 if (ConcatMask.size() == (NumOps * NumSrcElts))
58686 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58687 ConcatSubOperand(VT, Ops, 1),
58688 DAG.getUNDEF(VT), Subtarget, DAG);
58689 }
58690 break;
58691 case X86ISD::VPERMV3:
58692 // TODO: Handle 256-bit and NumOps == 4 cases.
58693 if (!IsSplat && NumOps == 2 &&
58694 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58695 MVT OpVT = Op0.getSimpleValueType();
58696 int NumSrcElts = OpVT.getVectorNumElements();
58697 SmallVector<int, 64> ConcatMask;
58698 for (unsigned i = 0; i != NumOps; ++i) {
58699 SmallVector<int, 64> SubMask;
58701 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58702 break;
58703 for (int M : SubMask) {
58704 if (0 <= M) {
58705 int Src = M < NumSrcElts ? 0 : 2;
58706 M += M < NumSrcElts ? 0 : NumSrcElts;
58707
58708 // Reference the lowest sub if the upper sub is the same.
58709 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58710 M += i * NumSrcElts;
58711 }
58712 ConcatMask.push_back(M);
58713 }
58714 }
58715 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58716 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58717 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58718 if (Concat0 || Concat1)
58719 return lowerShuffleWithPERMV(
58720 DL, VT, ConcatMask,
58721 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58722 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58723 DAG);
58724 }
58725 }
58726 break;
58727 case X86ISD::VPERM2X128: {
58728 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58729 assert(NumOps == 2 && "Bad concat_vectors operands");
58730 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58731 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58732 // TODO: Handle zero'd subvectors.
58733 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58734 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58735 (int)((Imm1 >> 4) & 0x3)};
58736 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58737 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58738 Ops[0].getOperand(1), DAG, DL);
58739 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58740 Ops[1].getOperand(1), DAG, DL);
58741 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58742 DAG.getBitcast(ShuffleVT, LHS),
58743 DAG.getBitcast(ShuffleVT, RHS),
58744 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58745 return DAG.getBitcast(VT, Res);
58746 }
58747 }
58748 break;
58749 }
58750 case X86ISD::SHUF128: {
58751 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58752 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58753 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58754 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58755 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58756 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58757 Ops[0].getOperand(1), DAG, DL);
58758 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58759 Ops[1].getOperand(1), DAG, DL);
58760 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58761 DAG.getTargetConstant(Imm, DL, MVT::i8));
58762 }
58763 break;
58764 }
58765 case ISD::TRUNCATE:
58766 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58767 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58768 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58769 SrcVT == Ops[1].getOperand(0).getValueType() &&
58770 Subtarget.useAVX512Regs() &&
58771 Subtarget.getPreferVectorWidth() >= 512 &&
58772 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58773 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58774 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58775 ConcatSubOperand(NewSrcVT, Ops, 0));
58776 }
58777 }
58778 break;
58779 case ISD::ANY_EXTEND:
58780 case ISD::SIGN_EXTEND:
58781 case ISD::ZERO_EXTEND:
58782 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58783 if (!IsSplat && NumOps == 2 &&
58784 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58785 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58786 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58787 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58788 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58789 SrcVT == Ops[1].getOperand(0).getValueType()) {
58790 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58791 return DAG.getNode(Opcode, DL, VT,
58792 ConcatSubOperand(NewSrcVT, Ops, 0));
58793 }
58794 }
58795 break;
58799 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58800 if (!IsSplat && NumOps == 2 &&
58801 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58802 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58803 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58805 Op0.getOperand(0).getValueType() ==
58806 Ops[0].getOperand(0).getValueType()) {
58807 EVT SrcVT = Op0.getOperand(0).getValueType();
58808 unsigned NumElts = VT.getVectorNumElements();
58809 MVT UnpackSVT =
58810 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58811 MVT UnpackVT =
58812 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58813 SDValue Unpack =
58814 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58815 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58816 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58817 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58818 DAG.getBitcast(SrcVT, Unpack), DAG);
58819 }
58820 break;
58821 }
58822 case X86ISD::VSHLI:
58823 case X86ISD::VSRLI:
58824 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58825 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58826 llvm::all_of(Ops, [](SDValue Op) {
58827 return Op.getConstantOperandAPInt(1) == 32;
58828 })) {
58829 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58830 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58831 Res = DAG.getBitcast(MVT::v8i32, Res);
58832 if (Opcode == X86ISD::VSHLI) {
58833 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58834 {8, 0, 8, 2, 8, 4, 8, 6});
58835 } else {
58836 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58837 {1, 8, 3, 8, 5, 8, 7, 8});
58838 }
58839 return DAG.getBitcast(VT, Res);
58840 }
58841 }
58842 [[fallthrough]];
58843 case X86ISD::VSRAI:
58844 case X86ISD::VSHL:
58845 case X86ISD::VSRL:
58846 case X86ISD::VSRA:
58847 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58848 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58849 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58850 llvm::all_of(Ops, [Op0](SDValue Op) {
58851 return Op0.getOperand(1) == Op.getOperand(1);
58852 })) {
58853 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58854 Op0.getOperand(1));
58855 }
58856 break;
58857 case X86ISD::VPERMI:
58858 case X86ISD::VROTLI:
58859 case X86ISD::VROTRI:
58860 if (!IsSplat &&
58861 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58862 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58863 llvm::all_of(Ops, [Op0](SDValue Op) {
58864 return Op0.getOperand(1) == Op.getOperand(1);
58865 })) {
58866 assert(!(Opcode == X86ISD::VPERMI &&
58867 Op0.getValueType().is128BitVector()) &&
58868 "Illegal 128-bit X86ISD::VPERMI nodes");
58869 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58870 Op0.getOperand(1));
58871 }
58872 break;
58873 case ISD::AND:
58874 case ISD::OR:
58875 case ISD::XOR:
58876 case X86ISD::ANDNP:
58877 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
58878 if (!IsSplat && (VT.is256BitVector() ||
58879 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58880 // Don't concatenate root AVX1 NOT patterns.
58881 // TODO: Allow NOT folding if Concat0 succeeds.
58882 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
58883 llvm::all_of(Ops, [](SDValue X) {
58884 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
58885 }))
58886 break;
58887 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58888 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58889 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
58890 return DAG.getNode(Opcode, DL, VT,
58891 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58892 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58893 }
58894 break;
58895 case X86ISD::PCMPEQ:
58896 case X86ISD::PCMPGT:
58897 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
58898 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
58899 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58900 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58901 if (Concat0 || Concat1)
58902 return DAG.getNode(Opcode, DL, VT,
58903 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58904 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58905 break;
58906 }
58907
58908 if (!IsSplat && VT == MVT::v8i32) {
58909 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
58910 // TODO: Handle v4f64 as well?
58911 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
58912 for (unsigned I = 0; I != NumOps; ++I) {
58913 MaxSigBitsLHS =
58914 std::max(MaxSigBitsLHS,
58915 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
58916 MaxSigBitsRHS =
58917 std::max(MaxSigBitsRHS,
58918 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
58919 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
58920 break;
58921 }
58922
58923 ISD::CondCode ICC =
58924 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
58925 ISD::CondCode FCC =
58927
58928 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
58929 MVT FpVT = VT.changeVectorElementType(FpSVT);
58930
58931 if (std::optional<unsigned> CastOpc =
58932 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
58933 SDValue LHS = CombineSubOperand(VT, Ops, 0);
58934 SDValue RHS = CombineSubOperand(VT, Ops, 1);
58935 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
58936 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
58937 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
58938 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
58939
58940 bool IsAlwaysSignaling;
58941 unsigned FSETCC =
58942 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
58943 return DAG.getBitcast(
58944 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
58945 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
58946 }
58947 }
58948 break;
58949 case ISD::CTPOP:
58950 case ISD::CTTZ:
58951 case ISD::CTLZ:
58954 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58955 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58956 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58957 }
58958 break;
58960 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
58961 if (!IsSplat &&
58962 (VT.is256BitVector() ||
58963 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58964 llvm::all_of(Ops, [Op0](SDValue Op) {
58965 return Op0.getOperand(2) == Op.getOperand(2);
58966 })) {
58967 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58968 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
58969 }
58970 break;
58971 case ISD::ADD:
58972 case ISD::SUB:
58973 case ISD::MUL:
58974 // TODO: Add more integer binops?
58975 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58976 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58977 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58978 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58979 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58980 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
58981 return Op.getOperand(0) == Op.getOperand(1);
58982 }))
58983 return DAG.getNode(Opcode, DL, VT,
58984 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58985 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58986 }
58987 break;
58988 // Due to VADD, VSUB, VMUL can executed on more ports than VINSERT and
58989 // their latency are short, so here we don't replace them unless we won't
58990 // introduce extra VINSERT.
58991 case ISD::FADD:
58992 case ISD::FSUB:
58993 case ISD::FMUL:
58994 if (!IsSplat && (VT.is256BitVector() ||
58995 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58996 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58997 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58998 if (Concat0 || Concat1)
58999 return DAG.getNode(Opcode, DL, VT,
59000 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59001 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59002 }
59003 break;
59004 // Always prefer to concatenate high latency FDIV instructions.
59005 case ISD::FDIV:
59006 if (!IsSplat && (VT.is256BitVector() ||
59007 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59008 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59009 ConcatSubOperand(VT, Ops, 1));
59010 }
59011 break;
59012 case X86ISD::HADD:
59013 case X86ISD::HSUB:
59014 case X86ISD::FHADD:
59015 case X86ISD::FHSUB:
59016 if (!IsSplat && VT.is256BitVector() &&
59017 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59018 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59019 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59020 if (Concat0 || Concat1)
59021 return DAG.getNode(Opcode, DL, VT,
59022 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59023 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59024 }
59025 break;
59026 case X86ISD::PACKSS:
59027 case X86ISD::PACKUS:
59028 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59029 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59030 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59031 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59032 NumOps * SrcVT.getVectorNumElements());
59033 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59034 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59035 if (Concat0 || Concat1)
59036 return DAG.getNode(
59037 Opcode, DL, VT,
59038 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59039 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59040 }
59041 break;
59042 case X86ISD::VSHLD:
59043 case X86ISD::VSHRD:
59044 case X86ISD::PALIGNR:
59045 if (!IsSplat &&
59046 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59047 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59048 llvm::all_of(Ops, [Op0](SDValue Op) {
59049 return Op0.getOperand(2) == Op.getOperand(2);
59050 })) {
59051 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59052 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59053 if (Concat0 || Concat1)
59054 return DAG.getNode(Opcode, DL, VT,
59055 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59056 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59057 Op0.getOperand(2));
59058 }
59059 break;
59060 case X86ISD::BLENDI:
59061 if (VT.is256BitVector() && NumOps == 2 &&
59062 (EltSizeInBits >= 32 ||
59063 (Subtarget.hasInt256() &&
59064 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59065 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59066 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59067 if (Concat0 || Concat1) {
59068 unsigned NumElts = VT.getVectorNumElements();
59069 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59070 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59071 Mask = Mask.zextOrTrunc(8);
59072 return DAG.getNode(Opcode, DL, VT,
59073 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59074 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59075 DAG.getTargetConstant(Mask, DL, MVT::i8));
59076 }
59077 }
59078 // TODO: BWI targets should only use CombineSubOperand.
59079 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59080 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59081 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59082 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59083 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59084 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59085 unsigned NumElts = VT.getVectorNumElements();
59086 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59087 for (unsigned I = 1; I != NumOps; ++I)
59088 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59089 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59090 Mask = Mask.zextOrTrunc(NumMaskBits);
59091 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59092 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59093 SDValue Sel =
59094 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59095 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59096 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59097 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59098 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59099 }
59100 }
59101 break;
59102 case ISD::VSELECT:
59103 // TODO: VSELECT should use CombineSubOperand.
59104 if (!IsSplat && Subtarget.hasAVX512() &&
59105 (VT.is256BitVector() ||
59106 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59107 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59108 EVT SelVT = Ops[0].getOperand(0).getValueType();
59109 if (SelVT.getVectorElementType() == MVT::i1) {
59110 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59111 NumOps * SelVT.getVectorNumElements());
59112 if (TLI.isTypeLegal(SelVT))
59113 return DAG.getNode(
59114 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59115 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59116 }
59117 }
59118 [[fallthrough]];
59119 case X86ISD::BLENDV:
59120 // TODO: BLENDV should use CombineSubOperand.
59121 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59122 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59123 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59124 EVT SelVT = Ops[0].getOperand(0).getValueType();
59125 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59126 if (TLI.isTypeLegal(SelVT))
59127 return DAG.getNode(
59128 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59129 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59130 }
59131 break;
59132 }
59133 }
59134
59135 // Fold subvector loads into one.
59136 // If needed, look through bitcasts to get to the load.
59137 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59138 unsigned Fast;
59139 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59140 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59141 *FirstLd->getMemOperand(), &Fast) &&
59142 Fast) {
59143 if (SDValue Ld =
59144 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59145 return Ld;
59146 }
59147 }
59148
59149 // Attempt to fold target constant loads.
59150 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59151 SmallVector<APInt> EltBits;
59152 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59153 for (unsigned I = 0; I != NumOps; ++I) {
59154 APInt OpUndefElts;
59155 SmallVector<APInt> OpEltBits;
59156 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59157 OpEltBits, /*AllowWholeUndefs*/ true,
59158 /*AllowPartialUndefs*/ false))
59159 break;
59160 EltBits.append(OpEltBits);
59161 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59162 }
59163 if (EltBits.size() == VT.getVectorNumElements()) {
59164 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59165 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59166 SDValue CV = DAG.getConstantPool(C, PVT);
59169 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59170 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59172 return Ld;
59173 }
59174 }
59175
59176 // If this simple subvector or scalar/subvector broadcast_load is inserted
59177 // into both halves, use a larger broadcast_load. Update other uses to use
59178 // an extracted subvector.
59179 if (IsSplat &&
59180 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59181 if (ISD::isNormalLoad(Op0.getNode()) ||
59184 auto *Mem = cast<MemSDNode>(Op0);
59185 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59188 if (SDValue BcastLd =
59189 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59190 SDValue BcastSrc =
59191 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59192 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59193 return BcastLd;
59194 }
59195 }
59196 }
59197
59198 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59199 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59200 Subtarget.useAVX512Regs()) {
59201 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59202 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59203 Res = DAG.getBitcast(ShuffleVT, Res);
59204 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59205 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59206 return DAG.getBitcast(VT, Res);
59207 }
59208
59209 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59210 if (!IsSplat &&
59211 ((NumOps == 2 && VT == MVT::v4f64) ||
59212 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59213 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59214 // Collect the individual per-lane v2f64/v4f64 shuffles.
59215 MVT OpVT = Ops[0].getSimpleValueType();
59216 unsigned NumOpElts = OpVT.getVectorNumElements();
59217 SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
59218 SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
59219 if (all_of(seq<int>(NumOps), [&](int I) {
59220 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59221 Depth + 1) &&
59222 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59223 none_of(SrcMasks[I], isUndefOrZero) &&
59224 SrcMasks[I].size() == NumOpElts &&
59225 all_of(SrcOps[I], [&OpVT](SDValue V) {
59226 return V.getValueType() == OpVT;
59227 });
59228 })) {
59229 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59230 bool Unary = true;
59231 unsigned SHUFPDMask = 0;
59232 SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59233 for (unsigned I = 0; I != NumOps; ++I) {
59234 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59235 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59236 Unary &= LHS[I] == RHS[I];
59237 for (unsigned J = 0; J != NumOpElts; ++J)
59238 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59239 }
59240 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59241 // PERMILPD mask and we can always profitably concatenate them.
59242 SDValue Concat0 =
59243 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59244 SDValue Concat1 =
59245 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59246 if (Unary || Concat0 || Concat1) {
59247 Concat0 =
59248 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59249 Concat1 =
59250 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59251 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59252 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59253 }
59254 }
59255 }
59256
59257 return SDValue();
59258}
59259
59262 const X86Subtarget &Subtarget) {
59263 EVT VT = N->getValueType(0);
59264 EVT SrcVT = N->getOperand(0).getValueType();
59265 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59266 SmallVector<SDValue, 4> Ops(N->ops());
59267
59268 if (VT.getVectorElementType() == MVT::i1) {
59269 // Attempt to constant fold.
59270 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59272 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59273 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59274 if (!C) break;
59275 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59276 if (I == (E - 1)) {
59277 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59278 if (TLI.isTypeLegal(IntVT))
59279 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59280 }
59281 }
59282
59283 // Don't do anything else for i1 vectors.
59284 return SDValue();
59285 }
59286
59287 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59288 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59289 Subtarget))
59290 return R;
59291 }
59292
59293 return SDValue();
59294}
59295
59298 const X86Subtarget &Subtarget) {
59299 if (DCI.isBeforeLegalizeOps())
59300 return SDValue();
59301
59302 MVT OpVT = N->getSimpleValueType(0);
59303
59304 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59305
59306 SDLoc dl(N);
59307 SDValue Vec = N->getOperand(0);
59308 SDValue SubVec = N->getOperand(1);
59309
59310 uint64_t IdxVal = N->getConstantOperandVal(2);
59311 MVT SubVecVT = SubVec.getSimpleValueType();
59312 int VecNumElts = OpVT.getVectorNumElements();
59313 int SubVecNumElts = SubVecVT.getVectorNumElements();
59314
59315 if (Vec.isUndef() && SubVec.isUndef())
59316 return DAG.getUNDEF(OpVT);
59317
59318 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59319 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59320 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59321 return getZeroVector(OpVT, Subtarget, DAG, dl);
59322
59324 // If we're inserting into a zero vector and then into a larger zero vector,
59325 // just insert into the larger zero vector directly.
59326 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59328 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59329 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59330 getZeroVector(OpVT, Subtarget, DAG, dl),
59331 SubVec.getOperand(1),
59332 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59333 }
59334
59335 // If we're inserting into a zero vector and our input was extracted from an
59336 // insert into a zero vector of the same type and the extraction was at
59337 // least as large as the original insertion. Just insert the original
59338 // subvector into a zero vector.
59339 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59340 isNullConstant(SubVec.getOperand(1)) &&
59342 SDValue Ins = SubVec.getOperand(0);
59343 if (isNullConstant(Ins.getOperand(2)) &&
59344 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59345 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59346 SubVecVT.getFixedSizeInBits())
59347 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59348 getZeroVector(OpVT, Subtarget, DAG, dl),
59349 Ins.getOperand(1), N->getOperand(2));
59350 }
59351 }
59352
59353 // Stop here if this is an i1 vector.
59354 if (IsI1Vector)
59355 return SDValue();
59356
59357 // Eliminate an intermediate vector widening:
59358 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59359 // insert_subvector X, Y, Idx
59360 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59361 // there?
59362 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59363 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59364 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59365 SubVec.getOperand(1), N->getOperand(2));
59366
59367 // If this is an insert of an extract, combine to a shuffle. Don't do this
59368 // if the insert or extract can be represented with a subregister operation.
59369 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59370 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59371 (IdxVal != 0 ||
59372 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59373 SDValue ExtSrc = SubVec.getOperand(0);
59374 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59375 // Create a shuffle mask matching the extraction and insertion.
59376 SmallVector<int, 64> Mask(VecNumElts);
59377 std::iota(Mask.begin(), Mask.end(), 0);
59378 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59379 ExtIdxVal + VecNumElts);
59380 if (ExtIdxVal != 0)
59381 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59382 // See if we can use a blend instead of extract/insert pair.
59383 SmallVector<int, 64> BlendMask(VecNumElts);
59384 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59385 std::iota(BlendMask.begin() + IdxVal,
59386 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59387 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59388 VecNumElts == (2 * SubVecNumElts)) {
59389 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59390 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59391 SDValue Blend = DAG.getNode(
59392 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59393 DAG.getBitcast(MVT::v8f32, ExtSrc),
59394 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59395 return DAG.getBitcast(OpVT, Blend);
59396 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59397 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59398 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59399 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59400 SDValue Shuffle =
59401 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59402 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59403 return DAG.getBitcast(OpVT, Shuffle);
59404 }
59405 }
59406 }
59407
59408 // Match concat_vector style patterns.
59409 SmallVector<SDValue, 2> SubVectorOps;
59410 if (collectConcatOps(N, SubVectorOps, DAG)) {
59411 if (SDValue Fold =
59412 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59413 return Fold;
59414
59415 // If we're inserting all zeros into the upper half, change this to
59416 // a concat with zero. We will match this to a move
59417 // with implicit upper bit zeroing during isel.
59418 // We do this here because we don't want combineConcatVectorOps to
59419 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59420 if (SubVectorOps.size() == 2 &&
59421 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59422 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59423 getZeroVector(OpVT, Subtarget, DAG, dl),
59424 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59425
59426 // Attempt to recursively combine to a shuffle.
59427 if (all_of(SubVectorOps, [](SDValue SubOp) {
59429 })) {
59430 SDValue Op(N, 0);
59431 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59432 return Res;
59433 }
59434 }
59435
59436 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59437 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59438 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59439
59440 // If this is a broadcast load inserted into an upper undef, use a larger
59441 // broadcast load.
59442 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59443 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59444 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59446 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59447 }
59448
59449 // If we're splatting the lower half subvector of a full vector load into the
59450 // upper half, attempt to create a subvector broadcast.
59451 if ((int)IdxVal == (VecNumElts / 2) &&
59452 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59453 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59454 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59455 if (VecLd && SubLd &&
59457 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59459 SubVecVT, SubLd, 0, DAG);
59460 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59461 BcastLd, DAG.getVectorIdxConstant(0, dl));
59462 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59463 return BcastLd;
59464 }
59465 }
59466
59467 // Attempt to constant fold (if we're not widening).
59468 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59469 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59470 APInt VecUndefElts, SubUndefElts;
59471 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59472 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59473 VecEltBits) &&
59474 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59475 SubEltBits)) {
59476 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59477 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59478 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59479 }
59480 }
59481
59482 // Attempt to recursively combine to a shuffle.
59485 SDValue Op(N, 0);
59486 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59487 return Res;
59488 }
59489
59490 // Match insertion of subvector load that perfectly aliases a base load.
59491 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59492 ISD::isNormalLoad(SubVec.getNode()) &&
59494 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59495 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59496 return Vec;
59497
59498 return SDValue();
59499}
59500
59501/// If we are extracting a subvector of a vector select and the select condition
59502/// is composed of concatenated vectors, try to narrow the select width. This
59503/// is a common pattern for AVX1 integer code because 256-bit selects may be
59504/// legal, but there is almost no integer math/logic available for 256-bit.
59505/// This function should only be called with legal types (otherwise, the calls
59506/// to get simple value types will assert).
59508 SelectionDAG &DAG) {
59509 SDValue Sel = Ext->getOperand(0);
59510 if (Sel.getOpcode() != ISD::VSELECT ||
59511 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59512 return SDValue();
59513
59514 // Note: We assume simple value types because this should only be called with
59515 // legal operations/types.
59516 // TODO: This can be extended to handle extraction to 256-bits.
59517 MVT VT = Ext->getSimpleValueType(0);
59518 if (!VT.is128BitVector())
59519 return SDValue();
59520
59521 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59522 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59523 return SDValue();
59524
59525 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59526 MVT SelVT = Sel.getSimpleValueType();
59527 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59528 "Unexpected vector type with legal operations");
59529
59530 unsigned SelElts = SelVT.getVectorNumElements();
59531 unsigned CastedElts = WideVT.getVectorNumElements();
59532 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59533 if (SelElts % CastedElts == 0) {
59534 // The select has the same or more (narrower) elements than the extract
59535 // operand. The extraction index gets scaled by that factor.
59536 ExtIdx *= (SelElts / CastedElts);
59537 } else if (CastedElts % SelElts == 0) {
59538 // The select has less (wider) elements than the extract operand. Make sure
59539 // that the extraction index can be divided evenly.
59540 unsigned IndexDivisor = CastedElts / SelElts;
59541 if (ExtIdx % IndexDivisor != 0)
59542 return SDValue();
59543 ExtIdx /= IndexDivisor;
59544 } else {
59545 llvm_unreachable("Element count of simple vector types are not divisible?");
59546 }
59547
59548 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59549 unsigned NarrowElts = SelElts / NarrowingFactor;
59550 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59551 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59552 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59553 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59554 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59555 return DAG.getBitcast(VT, NarrowSel);
59556}
59557
59560 const X86Subtarget &Subtarget) {
59561 if (!N->getValueType(0).isSimple())
59562 return SDValue();
59563
59564 MVT VT = N->getSimpleValueType(0);
59565 SDValue InVec = N->getOperand(0);
59566 unsigned IdxVal = N->getConstantOperandVal(1);
59567 EVT InVecVT = InVec.getValueType();
59568 unsigned SizeInBits = VT.getSizeInBits();
59569 unsigned InSizeInBits = InVecVT.getSizeInBits();
59570 unsigned NumSubElts = VT.getVectorNumElements();
59571 unsigned NumInElts = InVecVT.getVectorNumElements();
59572 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59573 SDLoc DL(N);
59574
59575 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59576 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59577 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59578 // We let generic combining take over from there to simplify the
59579 // insert/extract and 'not'.
59580 // This pattern emerges during AVX1 legalization. We handle it before lowering
59581 // to avoid complications like splitting constant vector loads.
59582 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59583 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59584 auto isConcatenatedNot = [](SDValue V) {
59585 V = peekThroughBitcasts(V);
59586 if (!isBitwiseNot(V))
59587 return false;
59588 SDValue NotOp = V->getOperand(0);
59590 };
59591 if (isConcatenatedNot(InVec.getOperand(0)) ||
59592 isConcatenatedNot(InVec.getOperand(1))) {
59593 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59594 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59595 splitVectorIntBinary(InVec, DAG, DL),
59596 N->getOperand(1));
59597 }
59598 }
59599
59600 if (DCI.isBeforeLegalizeOps())
59601 return SDValue();
59602
59603 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59604 return V;
59605
59607 return getZeroVector(VT, Subtarget, DAG, DL);
59608
59609 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59610 if (VT.getScalarType() == MVT::i1)
59611 return DAG.getConstant(1, DL, VT);
59612 return getOnesVector(VT, DAG, DL);
59613 }
59614
59615 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59616 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59617
59618 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
59619 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59620 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59621 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59622 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59623 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59624 }
59625
59626 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59627 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59628 // iff SUB is entirely contained in the extraction.
59629 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59630 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59631 SDValue Src = InVec.getOperand(0);
59632 SDValue Sub = InVec.getOperand(1);
59633 EVT SubVT = Sub.getValueType();
59634 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59635 if (IdxVal <= InsIdx &&
59636 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59637 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59638 DAG.getVectorIdxConstant(IdxVal, DL));
59639 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59640 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59641 }
59642 }
59643
59644 // If we're extracting an upper subvector see if we'd get the same elements if
59645 // we extracted the lowest subvector instead which should allow
59646 // SimplifyDemandedVectorElts do more simplifications.
59647 if (IdxVal != 0) {
59648 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59649 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59650 });
59651 if (AllEquiv)
59652 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59653 }
59654
59655 // Check if we're extracting a whole broadcasted subvector.
59656 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59657 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59658 EVT MemVT = MemIntr->getMemoryVT();
59659 if (MemVT == VT) {
59660 // If this is the only use, we can replace with a regular load (this may
59661 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59662 // memory chain).
59663 if (InVec.hasOneUse()) {
59664 SDValue Ld =
59665 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59666 MemIntr->getMemOperand());
59667 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59668 return Ld;
59669 }
59670 }
59671 }
59672
59673 // Attempt to extract from the source of a shuffle vector.
59674 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59675 SmallVector<int, 32> ShuffleMask;
59676 SmallVector<int, 32> ScaledMask;
59677 SmallVector<SDValue, 2> ShuffleInputs;
59678 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59679 // Decode the shuffle mask and scale it so its shuffling subvectors.
59680 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59681 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59682 unsigned SubVecIdx = IdxVal / NumSubElts;
59683 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59684 return DAG.getUNDEF(VT);
59685 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59686 return getZeroVector(VT, Subtarget, DAG, DL);
59687 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59688 if (Src.getValueSizeInBits() == InSizeInBits) {
59689 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59690 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59691 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59692 DL, SizeInBits);
59693 }
59694 }
59695 }
59696
59697 auto IsExtractFree = [](SDValue V) {
59698 if (V.hasOneUse()) {
59700 if (V.getOpcode() == ISD::LOAD)
59701 return true;
59702 }
59703 V = peekThroughBitcasts(V);
59704 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59705 return true;
59707 return true;
59708 return V.isUndef();
59709 };
59710
59711 // If we're extracting the lowest subvector and we're the only user,
59712 // we may be able to perform this with a smaller vector width.
59713 unsigned InOpcode = InVec.getOpcode();
59714 if (InVec.hasOneUse()) {
59715 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59716 // v2f64 CVTDQ2PD(v4i32).
59717 if (InOpcode == ISD::SINT_TO_FP &&
59718 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59719 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59720 }
59721 // v2f64 CVTUDQ2PD(v4i32).
59722 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59723 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59724 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59725 }
59726 // v2f64 CVTPS2PD(v4f32).
59727 if (InOpcode == ISD::FP_EXTEND &&
59728 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59729 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59730 }
59731 }
59732 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59733 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59734 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59735 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59736 Subtarget.hasVLX())) &&
59737 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59738 SDValue Src = InVec.getOperand(0);
59739 if (Src.getValueType().getScalarSizeInBits() == 32)
59740 return DAG.getNode(InOpcode, DL, VT,
59741 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59742 }
59743 if (IdxVal == 0 &&
59744 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59745 (SizeInBits == 128 || SizeInBits == 256) &&
59746 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59747 SDValue Ext = InVec.getOperand(0);
59748 if (Ext.getValueSizeInBits() > SizeInBits)
59749 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59750 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59751 return DAG.getNode(ExtOp, DL, VT, Ext);
59752 }
59753 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59754 InVec.getOperand(0).getValueType().is256BitVector() &&
59755 InVec.getOperand(1).getValueType().is256BitVector() &&
59756 InVec.getOperand(2).getValueType().is256BitVector()) {
59757 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59758 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59759 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59760 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59761 }
59762 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59763 (SizeInBits == 128 || SizeInBits == 256)) {
59764 SDValue InVecSrc = InVec.getOperand(0);
59765 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59766 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59767 return DAG.getNode(InOpcode, DL, VT, Ext);
59768 }
59769
59770 if (SizeInBits == 128 || SizeInBits == 256) {
59771 switch (InOpcode) {
59772 case X86ISD::MOVDDUP:
59773 return DAG.getNode(
59774 InOpcode, DL, VT,
59775 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59776 case X86ISD::PSHUFD:
59777 case X86ISD::VPERMILPI:
59778 if (InVec.getOperand(0).hasOneUse()) {
59779 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59780 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59781 return DAG.getNode(InOpcode, DL, VT,
59782 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59783 DL, SizeInBits),
59784 DAG.getTargetConstant(M, DL, MVT::i8));
59785 }
59786 break;
59787 case X86ISD::PCMPEQ:
59788 case X86ISD::PCMPGT:
59789 case X86ISD::UNPCKH:
59790 case X86ISD::UNPCKL:
59791 if (IsExtractFree(InVec.getOperand(0)) ||
59792 IsExtractFree(InVec.getOperand(1)))
59793 return DAG.getNode(InOpcode, DL, VT,
59794 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59795 DL, SizeInBits),
59796 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59797 DL, SizeInBits));
59798 break;
59799 case X86ISD::CMPP:
59800 if (IsExtractFree(InVec.getOperand(0)) ||
59801 IsExtractFree(InVec.getOperand(1)))
59802 return DAG.getNode(InOpcode, DL, VT,
59803 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59804 DL, SizeInBits),
59805 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59806 DL, SizeInBits),
59807 InVec.getOperand(2));
59808 break;
59809 case X86ISD::BLENDI:
59810 if (IsExtractFree(InVec.getOperand(0)) ||
59811 IsExtractFree(InVec.getOperand(1))) {
59812 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59813 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59814 return DAG.getNode(InOpcode, DL, VT,
59815 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59816 DL, SizeInBits),
59817 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59818 DL, SizeInBits),
59819 DAG.getTargetConstant(M, DL, MVT::i8));
59820 }
59821 break;
59822 case X86ISD::VPERMV:
59823 if (IdxVal != 0) {
59824 SDValue Mask = InVec.getOperand(0);
59825 SDValue Src = InVec.getOperand(1);
59826 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59827 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59828 DL, InSizeInBits);
59829 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59830 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59831 }
59832 break;
59833 case X86ISD::VPERMV3:
59834 if (IdxVal != 0) {
59835 SDValue Src0 = InVec.getOperand(0);
59836 SDValue Mask = InVec.getOperand(1);
59837 SDValue Src1 = InVec.getOperand(2);
59838 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59839 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59840 DL, InSizeInBits);
59841 SDValue Shuffle =
59842 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59843 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59844 }
59845 break;
59846 }
59847 }
59848 }
59849
59850 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
59851 // as this is very likely to fold into a shuffle/truncation.
59852 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59853 InVecVT.getScalarSizeInBits() == 64 &&
59854 InVec.getConstantOperandAPInt(1) == 32) {
59855 SDValue Ext =
59856 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59857 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59858 }
59859
59860 return SDValue();
59861}
59862
59864 const X86Subtarget &Subtarget) {
59865 using namespace SDPatternMatch;
59866 EVT VT = N->getValueType(0);
59867 SDValue Src = N->getOperand(0);
59868 SDLoc DL(N);
59869
59870 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
59871 // This occurs frequently in our masked scalar intrinsic code and our
59872 // floating point select lowering with AVX512.
59873 // TODO: SimplifyDemandedBits instead?
59874 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
59875 isOneConstant(Src.getOperand(1)))
59876 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
59877
59878 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
59879 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
59880 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
59881 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
59882 isNullConstant(Src.getOperand(1)))
59883 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
59884 Src.getOperand(1));
59885
59886 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
59887 // TODO: Move to DAGCombine/SimplifyDemandedBits?
59888 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
59889 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
59890 if (Op.getValueType() != MVT::i64)
59891 return SDValue();
59892 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
59893 if (Op.getOpcode() == Opc &&
59894 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
59895 return Op.getOperand(0);
59896 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
59897 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
59898 if (Ld->getExtensionType() == Ext &&
59899 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
59900 return Op;
59901 if (IsZeroExt) {
59902 KnownBits Known = DAG.computeKnownBits(Op);
59903 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
59904 return Op;
59905 }
59906 return SDValue();
59907 };
59908
59909 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
59910 return DAG.getBitcast(
59911 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
59912 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
59913
59914 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
59915 return DAG.getBitcast(
59916 VT,
59917 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
59918 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
59919 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
59920 }
59921
59922 if (Src.getOpcode() == ISD::BITCAST) {
59923 SDValue SrcOp = Src.getOperand(0);
59924 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
59925 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
59926 return DAG.getBitcast(
59927 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
59928 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
59929 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
59930 return DAG.getBitcast(
59931 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
59932 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
59933 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
59934 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
59935 }
59936
59937 if (VT == MVT::v4i32) {
59938 SDValue HalfSrc;
59939 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
59940 // to remove XMM->GPR->XMM moves.
59941 if (sd_match(Src, m_AnyExt(m_BitCast(
59942 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
59943 return DAG.getBitcast(
59944 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
59945 }
59946
59947 // See if we're broadcasting the scalar value, in which case just reuse that.
59948 // Ensure the same SDValue from the SDNode use is being used.
59949 if (VT.getScalarType() == Src.getValueType())
59950 for (SDNode *User : Src->users())
59951 if (User->getOpcode() == X86ISD::VBROADCAST &&
59952 Src == User->getOperand(0)) {
59953 unsigned SizeInBits = VT.getFixedSizeInBits();
59954 unsigned BroadcastSizeInBits =
59955 User->getValueSizeInBits(0).getFixedValue();
59956 if (BroadcastSizeInBits == SizeInBits)
59957 return SDValue(User, 0);
59958 if (BroadcastSizeInBits > SizeInBits)
59959 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
59960 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
59961 // coverage.
59962 }
59963
59964 // Check for cases where we've ended up with a scalarized shift, typically
59965 // during type legalization.
59966 switch (Src.getOpcode()) {
59967 case ISD::SHL:
59968 case ISD::SRL:
59969 case ISD::SRA:
59970 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
59971 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
59972 Src.hasOneUse()) {
59973 SDValue SrcVec =
59974 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
59975 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
59976 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
59977 Amt->getZExtValue(), DAG);
59978 }
59979 }
59980 break;
59981 case ISD::FSHL:
59982 case ISD::FSHR:
59983 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
59984 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
59985 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
59986 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
59987 Src.hasOneUse()) {
59988 uint64_t AmtVal =
59989 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
59990 SDValue SrcVec0 =
59991 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
59992 SDValue SrcVec1 =
59993 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
59994 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
59995 DAG.getConstant(AmtVal, DL, VT));
59996 }
59997 }
59998 break;
59999 }
60000
60001 return SDValue();
60002}
60003
// Canonicalizes constants to the RHS, folds x*0 -> 0, lets
// SimplifyDemandedBits exploit that only the low 32 bits of each 64-bit
// operand element are read, and rewrites sext/zext_vector_inreg v4i32 inputs
// as an explicit shuffle to expose further shuffle combines.
// NOTE(review): gaps in the embedded numbering (60005-60006 signature,
// 60012-60013 constant-RHS condition) are lines this hyperlinked listing
// dropped, not deleted code.
60004// Simplify PMULDQ and PMULUDQ operations.
60007 const X86Subtarget &Subtarget) {
60008 SDValue LHS = N->getOperand(0);
60009 SDValue RHS = N->getOperand(1);
60010
60011 // Canonicalize constant to RHS.
60014 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60015
60016 // Multiply by zero.
60017 // Don't return RHS as it may contain UNDEFs.
60018 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60019 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60020
60021 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60022 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60023 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60024 return SDValue(N, 0);
60025
60026 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60027 // convert it to any_extend_invec, due to the LegalOperations check, do the
60028 // conversion directly to a vector shuffle manually. This exposes combine
60029 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60030 // combineX86ShufflesRecursively on SSE4.1 targets.
60031 // FIXME: This is basically a hack around several other issues related to
60032 // ANY_EXTEND_VECTOR_INREG.
60033 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60034 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60035 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60036 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60037 SDLoc dl(N);
// Shuffle mask {0,-1,1,-1} interleaves the two low elements with undef
// lanes - the extended (upper) halves are never read by PMUL(U)DQ.
60038 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60039 LHS.getOperand(0), { 0, -1, 1, -1 });
60040 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60041 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60042 }
// Mirror of the LHS case above for the RHS operand.
60043 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60044 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60045 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60046 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60047 SDLoc dl(N);
60048 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60049 RHS.getOperand(0), { 0, -1, 1, -1 });
60050 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60051 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60052 }
60053
60054 return SDValue();
60055}
60056
// Folds to zero when either operand is an all-zeros build vector,
// constant-folds the pairwise multiply-add, and finally tries
// SimplifyDemandedVectorElts. Per the folding loop below, each result
// element combines a pair of adjacent source elements: LHS is sign-extended
// for VPMADDWD and zero-extended for VPMADDUBSW, RHS is always sign-extended,
// and the two products are summed (wrapping add for VPMADDWD,
// signed-saturating add for VPMADDUBSW).
// NOTE(review): this listing dropped the signature (60058-60059), half of the
// opcode assert (60065) and the RHS all-zeros check (60071).
60057// Simplify VPMADDUBSW/VPMADDWD operations.
60060 MVT VT = N->getSimpleValueType(0);
60061 SDValue LHS = N->getOperand(0);
60062 SDValue RHS = N->getOperand(1);
60063 unsigned Opc = N->getOpcode();
60064 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60066 "Unexpected PMADD opcode");
60067
60068 // Multiply by zero.
60069 // Don't return LHS/RHS as it may contain UNDEFs.
60070 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60072 return DAG.getConstant(0, SDLoc(N), VT);
60073
60074 // Constant folding.
60075 APInt LHSUndefs, RHSUndefs;
60076 SmallVector<APInt> LHSBits, RHSBits;
60077 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60078 unsigned DstEltBits = VT.getScalarSizeInBits();
60079 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60080 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60081 SmallVector<APInt> Result;
60082 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60083 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60084 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60085 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60086 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60087 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60088 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60089 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60090 Result.push_back(Res);
60091 }
60092 return getConstVector(Result, VT, DAG, SDLoc(N));
60093 }
60094
60095 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60096 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60097 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60098 return SDValue(N, 0);
60099
60100 return SDValue();
60101}
60102
// Demands all result bits of the node and lets the target's
// SimplifyDemandedBits implementation do the actual simplification
// (presumably exploiting the 52-bit multiply-add semantics - the knowledge
// lives in the demanded-bits hook, not here).
// NOTE(review): the signature lines (60104-60105) are missing from this
// hyperlinked listing.
60103// Simplify VPMADD52L/VPMADD52H operations.
60106 MVT VT = N->getSimpleValueType(0);
60107 unsigned NumEltBits = VT.getScalarSizeInBits();
60108 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60109 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60110 DCI))
60111 return SDValue(N, 0);
60112
60113 return SDValue();
60114}
60115
// Combine *_EXTEND_VECTOR_INREG nodes: merge a one-use simple normal load
// into an extending load, collapse nested identical extends, fold an extend
// of extract_subvector(EXTEND(X),0) back to an extend of X, turn a
// zext_inreg of a BUILD_VECTOR into an interleaved-with-zeros BUILD_VECTOR,
// and finally try the recursive x86 shuffle combiner on SSE4.1+.
// NOTE(review): this listing dropped the signature (60116-60117) and lines
// 60132-60133 (the SEXTLOAD-vs-ZEXTLOAD selection feeding 'Ext' below).
60118 const X86Subtarget &Subtarget) {
60119 EVT VT = N->getValueType(0);
60120 SDValue In = N->getOperand(0);
60121 unsigned Opcode = N->getOpcode();
60122 unsigned InOpcode = In.getOpcode();
60123 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60124 SDLoc DL(N);
60125
60126 // Try to merge vector loads and extend_inreg to an extload.
60127 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60128 In.hasOneUse()) {
60129 auto *Ld = cast<LoadSDNode>(In);
60130 if (Ld->isSimple()) {
60131 MVT SVT = In.getSimpleValueType().getVectorElementType();
60134 : ISD::ZEXTLOAD;
60135 EVT MemVT = VT.changeVectorElementType(SVT);
60136 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60137 SDValue Load = DAG.getExtLoad(
60138 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60139 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
// Redirect the old load's chain users to the new extending load.
60140 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60141 return Load;
60142 }
60143 }
60144 }
60145
60146 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60147 if (Opcode == InOpcode)
60148 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60149
60150 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60151 // -> EXTEND_VECTOR_INREG(X).
60152 // TODO: Handle non-zero subvector indices.
60153 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60154 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60155 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60156 In.getValueSizeInBits())
60157 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60158
60159 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60160 // TODO: Move to DAGCombine?
60161 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60162 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60163 In.getValueSizeInBits() == VT.getSizeInBits()) {
60164 unsigned NumElts = VT.getVectorNumElements();
// Scale = how many narrow source lanes make up one wide result lane; the
// source element is placed in the low lane of each group, zeros elsewhere.
60165 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60166 EVT EltVT = In.getOperand(0).getValueType();
60167 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60168 for (unsigned I = 0; I != NumElts; ++I)
60169 Elts[I * Scale] = In.getOperand(I);
60170 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60171 }
60172
60173 // Attempt to combine as a shuffle on SSE41+ targets.
60174 if (Subtarget.hasSSE41()) {
60175 SDValue Op(N, 0);
60176 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60177 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60178 return Res;
60179 }
60180
60181 return SDValue();
60182}
60183
// Combine x86 mask-register shifts (KSHIFT*): an all-zeros input folds to
// zero; for KSHIFTR, merge with an inner extract_subvector or KSHIFTR by
// summing the constant shift amounts (guarded so the combined amount stays
// within the wider legal source type); otherwise fall back to
// SimplifyDemandedVectorElts.
// NOTE(review): the signature lines (60184-60185) are missing from this
// hyperlinked listing.
60186 EVT VT = N->getValueType(0);
60187 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60188 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60189 return DAG.getConstant(0, SDLoc(N), VT);
60190
60191 // Fold kshiftr(extract_subvector(X,C1),C2)
60192 // --> extract_subvector(kshiftr(X,C1+C2),0)
60193 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60194 if (N->getOpcode() == X86ISD::KSHIFTR) {
60195 SDLoc DL(N);
60196 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60197 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60198 SDValue Src = N->getOperand(0).getOperand(0);
// Operand 1 of both inner node kinds is a constant (subvector index or
// shift amount), so the two constants can simply be added.
60199 uint64_t Amt = N->getConstantOperandVal(1) +
60200 N->getOperand(0).getConstantOperandVal(1);
60201 EVT SrcVT = Src.getValueType();
60202 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60203 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60204 DAG.getTargetConstant(Amt, DL, MVT::i8));
60205 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60206 DAG.getVectorIdxConstant(0, DL));
60207 }
60208 }
60209 }
60210
60211 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60212 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60213 return SDValue(N, 0);
60214
60215 return SDValue();
60216}
60217
// Requires F16C and hard-float; only handles the scalar f32 -> fp16 -> f32
// round-trip. The scalar is widened to v4f32 so the vector CVTPS2PH/CVTPH2PS
// pair can be used, then element 0 is extracted back out.
// NOTE(review): the signature line (60221) is missing from this listing.
60218// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60219// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
60220// extra instructions between the conversion due to going to scalar and back.
60222 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60223 return SDValue();
60224
60225 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60226 return SDValue();
60227
60228 if (N->getValueType(0) != MVT::f32 ||
60229 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60230 return SDValue();
60231
60232 SDLoc dl(N);
60233 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60234 N->getOperand(0).getOperand(0));
// Immediate 4 is the CVTPS2PH rounding-control operand (same constant used
// by the other CVTPS2PH emitters in this file) - see the Intel SDM for the
// exact encoding; TODO(review) confirm it selects MXCSR-directed rounding.
60235 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60236 DAG.getTargetConstant(4, dl, MVT::i32));
60237 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60238 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60239 DAG.getVectorIdxConstant(0, dl));
60240}
60242
// Combine FP_EXTEND (including the strict variant) for bf16 and f16 sources:
//  - bf16: after legalization, fold fpext(fpround(x)) back to x; vector
//    bf16 -> f32 is done by shifting the bf16 bits into the high half of an
//    i32 lane; bf16 -> f64 goes via f32.
//  - f16 with F16C but without AVX512-FP16: widen to at least 8 source
//    elements, convert with (STRICT_)CVTPH2PS, then narrow/extend to the
//    requested type.
// NOTE(review): the signature lines (60243-60244) are missing from this
// hyperlinked listing.
60245 const X86Subtarget &Subtarget) {
60246 EVT VT = N->getValueType(0);
60247 bool IsStrict = N->isStrictFPOpcode();
// Strict FP nodes carry the chain in operand 0; the value moves to slot 1.
60248 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60249 EVT SrcVT = Src.getValueType();
60250
60251 SDLoc dl(N);
60252 if (SrcVT.getScalarType() == MVT::bf16) {
60253 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60254 !IsStrict && Src.getOperand(0).getValueType() == VT)
60255 return Src.getOperand(0);
60256
60257 if (!SrcVT.isVector())
60258 return SDValue();
60259
60260 assert(!IsStrict && "Strict FP doesn't support BF16");
60261 if (VT.getVectorElementType() == MVT::f64) {
60262 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60263 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60264 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60265 }
60266 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
// bf16 is the top 16 bits of an f32, so zext + shl-by-16 reconstructs the
// f32 bit pattern directly in integer lanes.
60267 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60268 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60269 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60270 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60271 return DAG.getBitcast(VT, Src);
60272 }
60273
60274 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60275 return SDValue();
60276
// With native AVX512-FP16 support there is nothing to do here.
60277 if (Subtarget.hasFP16())
60278 return SDValue();
60279
60280 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60281 return SDValue();
60282
60283 if (VT.getVectorElementType() != MVT::f32 &&
60284 VT.getVectorElementType() != MVT::f64)
60285 return SDValue();
60286
60287 unsigned NumElts = VT.getVectorNumElements();
60288 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60289 return SDValue();
60290
60291 // Convert the input to vXi16.
60292 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60293 Src = DAG.getBitcast(IntVT, Src);
60294
60295 // Widen to at least 8 input elements.
60296 if (NumElts < 8) {
60297 unsigned NumConcats = 8 / NumElts;
60298 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60299 : DAG.getConstant(0, dl, IntVT);
60300 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60301 Ops[0] = Src;
60302 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60303 }
60304
60305 // Destination is vXf32 with at least 4 elements.
60306 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60307 std::max(4U, NumElts));
60308 SDValue Cvt, Chain;
60309 if (IsStrict) {
60310 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60311 {N->getOperand(0), Src});
60312 Chain = Cvt.getValue(1);
60313 } else {
60314 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60315 }
60316
60317 if (NumElts < 4) {
60318 assert(NumElts == 2 && "Unexpected size");
60319 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60320 DAG.getVectorIdxConstant(0, dl));
60321 }
60322
60323 if (IsStrict) {
60324 // Extend to the original VT if necessary.
60325 if (Cvt.getValueType() != VT) {
60326 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60327 {Chain, Cvt});
60328 Chain = Cvt.getValue(1);
60329 }
60330 return DAG.getMergeValues({Cvt, Chain}, dl);
60331 }
60332
60333 // Extend to the original VT if necessary.
60334 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60335}
60336
// If another user of the same base pointer performs the same kind of
// broadcast load from the same chain and memory size but produces a wider
// vector, extract our narrower value from it instead; DCI.CombineTo also
// redirects our chain result to the wider load's chain.
// NOTE(review): this listing dropped the signature (60338-60339) and line
// 60361 inside the success path.
60337// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60340 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60341 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60342 "Unknown broadcast load type");
60343
60344 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60345 SDValue Ptr = MemIntrin->getBasePtr();
60346 SDValue Chain = MemIntrin->getChain();
60347 EVT VT = N->getSimpleValueType(0);
60348 EVT MemVT = MemIntrin->getMemoryVT();
60349
60350 // Look at other users of our base pointer and try to find a wider broadcast.
60351 // The input chain and the size of the memory VT must match.
60352 for (SDNode *User : Ptr->users())
60353 if (User != N && User->getOpcode() == N->getOpcode() &&
60354 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60355 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60356 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60357 MemVT.getSizeInBits() &&
60358 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60359 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60360 MemIntrin->isSimple() && "Illegal broadcast load type");
60362 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60363 VT.getSizeInBits());
60364 Extract = DAG.getBitcast(VT, Extract);
60365 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60366 return Extract;
60367 }
60368
60369 return SDValue();
60370}
60371
// Combine FP_ROUND (including the strict variant) producing vXf16 from vXf32:
//  - with AVX512-FP16: fold fp_round(concat(xint_to_fp v4i64, xint_to_fp
//    v4i64)) into a shuffle of two direct CVTXI2P conversions.
//  - otherwise (F16C): widen to at least 4 f32 elements, convert with
//    (STRICT_)CVTPS2PH, then extract/bitcast down to the requested type.
// NOTE(review): this listing dropped the signature (60372) and the second
// halves of two opcode-selecting ternaries (60410-60411, 60420) as well as
// line 60450 (the narrow vXi16 type for the final extract).
60373 const X86Subtarget &Subtarget) {
60374 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60375 return SDValue();
60376
60377 bool IsStrict = N->isStrictFPOpcode();
60378 EVT VT = N->getValueType(0);
60379 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60380 EVT SrcVT = Src.getValueType();
60381
60382 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60383 SrcVT.getVectorElementType() != MVT::f32)
60384 return SDValue();
60385
60386 SDLoc dl(N);
60387
60388 SDValue Cvt, Chain;
60389 unsigned NumElts = VT.getVectorNumElements();
60390 if (Subtarget.hasFP16()) {
60391 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60392 // v4f32 (xint_to_fp v4i64))))
60393 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60394 // v8f16 (CVTXI2P v4i64)))
60395 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60396 Src.getNumOperands() == 2) {
60397 SDValue Cvt0, Cvt1;
60398 SDValue Op0 = Src.getOperand(0);
60399 SDValue Op1 = Src.getOperand(1);
60400 bool IsOp0Strict = Op0->isStrictFPOpcode();
60401 if (Op0.getOpcode() != Op1.getOpcode() ||
60402 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60403 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60404 return SDValue();
60405 }
// Each CVTXI2P yields its 4 results in the low half of a v8f16; this mask
// picks the low 4 lanes of each conversion.
60406 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60407 if (IsStrict) {
60408 assert(IsOp0Strict && "Op0 must be strict node");
60409 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60412 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60413 {Op0.getOperand(0), Op0.getOperand(1)});
60414 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60415 {Op1.getOperand(0), Op1.getOperand(1)});
60416 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
// NOTE(review): only Cvt0's chain is merged back here - presumably the two
// strict conversions share the incoming chain; confirm against upstream.
60417 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60418 }
60419 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60421 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60422 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60423 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60424 }
60425 return SDValue();
60426 }
60427
60428 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60429 return SDValue();
60430
60431 // Widen to at least 4 input elements.
60432 if (NumElts < 4)
60433 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60434 DAG.getConstantFP(0.0, dl, SrcVT));
60435
60436 // Destination is v8i16 with at least 8 elements.
60437 EVT CvtVT =
60438 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
// Rounding-control immediate for CVTPS2PH (same constant as the other
// CVTPS2PH emitters in this file).
60439 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60440 if (IsStrict) {
60441 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60442 {N->getOperand(0), Src, Rnd});
60443 Chain = Cvt.getValue(1);
60444 } else {
60445 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60446 }
60447
60448 // Extract down to real number of elements.
60449 if (NumElts < 8) {
60451 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60452 DAG.getVectorIdxConstant(0, dl));
60453 }
60454
60455 Cvt = DAG.getBitcast(VT, Cvt);
60456
60457 if (IsStrict)
60458 return DAG.getMergeValues({Cvt, Chain}, dl);
60459
60460 return Cvt;
60461}
60462
// Combine MOVDQ2Q: when the source is a one-use simple normal load, load the
// value directly as x86mmx and splice the new load's chain over the old one.
// NOTE(review): the signature line (60463) is missing from this listing.
60464 SDValue Src = N->getOperand(0);
60465
60466 // Turn MOVDQ2Q+simple_load into an mmx load.
60467 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60468 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60469
60470 if (LN->isSimple()) {
60471 SDValue NewLd =
60472 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60473 LN->getPointerInfo(), LN->getBaseAlign(),
60474 LN->getMemOperand()->getFlags());
// Keep chain users of the original load ordered after the new mmx load.
60475 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60476 return NewLd;
60477 }
60478 }
60479
60480 return SDValue();
60481}
60482
// Demand all result bits and let SimplifyDemandedBits prune unused inputs.
// NOTE(review): the signature lines (60483-60484) are missing from this
// listing; based on the dispatch-table position this is presumably the
// PDEP/BZHI-style combine - confirm against upstream X86ISelLowering.cpp.
60485 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60486 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60487 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60488 return SDValue(N, 0);
60489
60490 return SDValue();
60491}
60492
// Rebuilds the node with every v1i64 operand bitcast to x86mmx, and with a
// v1i64 first result retyped to x86mmx (bitcast back to v1i64 for existing
// users via getMergeValues). Returns SDValue() when nothing needed changing.
// NOTE(review): this listing dropped the signature (60496) and the local
// declarations of 'Args' (60500) and 'Returns' (60521) used below.
60493// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60494// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60495// use x86mmx instead.
60497 SDLoc dl(N);
60498
60499 bool MadeChange = false, CastReturnVal = false;
60501 for (const SDValue &Arg : N->op_values()) {
60502 if (Arg.getValueType() == MVT::v1i64) {
60503 MadeChange = true;
60504 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60505 } else
60506 Args.push_back(Arg);
60507 }
60508 SDVTList VTs = N->getVTList();
60509 SDVTList NewVTs = VTs;
60510 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60511 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60512 NewVTArr[0] = MVT::x86mmx;
60513 NewVTs = DAG.getVTList(NewVTArr);
60514 MadeChange = true;
60515 CastReturnVal = true;
60516 }
60517
60518 if (MadeChange) {
60519 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60520 if (CastReturnVal) {
60522 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60523 Returns.push_back(Result.getValue(i));
// Only result 0 changed type; present it to existing users as v1i64 again.
60524 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60525 return DAG.getMergeValues(Returns, dl);
60526 }
60527 return Result;
60528 }
60529 return SDValue();
60530}
// Before legalization, rewrite INTR_TYPE_CAST_MMX intrinsics (intrinsic id
// in operand 0 - no chain) via FixupMMXIntrinsicTypes.
// NOTE(review): the signature lines (60531-60532) are missing; the name is
// inferred from getIntrinsicWithoutChain - confirm against upstream.
60533 if (!DCI.isBeforeLegalize())
60534 return SDValue();
60535
60536 unsigned IntNo = N->getConstantOperandVal(0);
60537 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60538
60539 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60540 return FixupMMXIntrinsicTypes(N, DAG);
60541
60542 return SDValue();
60543}
60544
// Before legalization, rewrite chained INTR_TYPE_CAST_MMX intrinsics
// (operand 0 is the chain, intrinsic id in operand 1) via
// FixupMMXIntrinsicTypes.
// NOTE(review): the signature lines (60545-60546) are missing; the name is
// inferred from getIntrinsicWithChain - confirm against upstream.
60547 if (!DCI.isBeforeLegalize())
60548 return SDValue();
60549
60550 unsigned IntNo = N->getConstantOperandVal(1);
60551 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60552
60553 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60554 return FixupMMXIntrinsicTypes(N, DAG);
60555
60556 return SDValue();
60557}
60558
// Before legalization, rewrite void chained INTR_TYPE_CAST_MMX intrinsics
// (operand 0 is the chain, intrinsic id in operand 1) via
// FixupMMXIntrinsicTypes. Body is identical to the W_CHAIN variant above
// apart from the (not visible) signature.
// NOTE(review): the signature lines (60559-60560) are missing; the name is
// inferred from its position after the W_CHAIN combine - confirm upstream.
60561 if (!DCI.isBeforeLegalize())
60562 return SDValue();
60563
60564 unsigned IntNo = N->getConstantOperandVal(1);
60565 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60566
60567 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60568 return FixupMMXIntrinsicTypes(N, DAG);
60569
60570 return SDValue();
60571}
60572
60574 DAGCombinerInfo &DCI) const {
60575 SelectionDAG &DAG = DCI.DAG;
60576 switch (N->getOpcode()) {
60577 // clang-format off
60578 default: break;
60580 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60582 case X86ISD::PEXTRW:
60583 case X86ISD::PEXTRB:
60584 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60586 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60588 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60590 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60591 case ISD::VSELECT:
60592 case ISD::SELECT:
60593 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60594 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60595 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60596 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60597 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60598 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60599 case X86ISD::ADD:
60600 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60601 case X86ISD::CLOAD:
60602 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60603 case X86ISD::SBB: return combineSBB(N, DAG);
60604 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60605 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60606 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60607 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60608 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60609 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60610 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60611 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60612 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60613 case ISD::AVGCEILS:
60614 case ISD::AVGCEILU:
60615 case ISD::AVGFLOORS:
60616 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60617 case X86ISD::BEXTR:
60618 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60619 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60620 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60621 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60622 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60624 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60625 case ISD::SINT_TO_FP:
60627 return combineSIntToFP(N, DAG, DCI, Subtarget);
60628 case ISD::UINT_TO_FP:
60630 return combineUIntToFP(N, DAG, Subtarget);
60631 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60632 case ISD::LRINT:
60633 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60634 case ISD::FADD:
60635 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60636 case X86ISD::VFCMULC:
60637 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60638 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60639 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60640 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60641 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60642 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60643 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60644 case X86ISD::FXOR:
60645 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60646 case X86ISD::FMIN:
60647 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60648 case ISD::FMINNUM:
60649 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60650 case X86ISD::CVTSI2P:
60651 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60652 case X86ISD::CVTP2SI:
60653 case X86ISD::CVTP2UI:
60655 case X86ISD::CVTTP2SI:
60657 case X86ISD::CVTTP2UI:
60658 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60660 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60661 case X86ISD::BT: return combineBT(N, DAG, DCI);
60662 case ISD::ANY_EXTEND:
60663 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60664 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60665 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60669 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60670 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60671 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60672 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60673 case X86ISD::PACKSS:
60674 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60675 case X86ISD::HADD:
60676 case X86ISD::HSUB:
60677 case X86ISD::FHADD:
60678 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60679 case X86ISD::VSHL:
60680 case X86ISD::VSRA:
60681 case X86ISD::VSRL:
60682 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60683 case X86ISD::VSHLI:
60684 case X86ISD::VSRAI:
60685 case X86ISD::VSRLI:
60686 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60688 case X86ISD::PINSRB:
60689 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60690 case X86ISD::SHUFP: // Handle all target specific shuffles
60691 case X86ISD::INSERTPS:
60692 case X86ISD::EXTRQI:
60693 case X86ISD::INSERTQI:
60694 case X86ISD::VALIGN:
60695 case X86ISD::PALIGNR:
60696 case X86ISD::VSHLDQ:
60697 case X86ISD::VSRLDQ:
60698 case X86ISD::BLENDI:
60699 case X86ISD::UNPCKH:
60700 case X86ISD::UNPCKL:
60701 case X86ISD::MOVHLPS:
60702 case X86ISD::MOVLHPS:
60703 case X86ISD::PSHUFB:
60704 case X86ISD::PSHUFD:
60705 case X86ISD::PSHUFHW:
60706 case X86ISD::PSHUFLW:
60707 case X86ISD::MOVSHDUP:
60708 case X86ISD::MOVSLDUP:
60709 case X86ISD::MOVDDUP:
60710 case X86ISD::MOVSS:
60711 case X86ISD::MOVSD:
60712 case X86ISD::MOVSH:
60713 case X86ISD::VBROADCAST:
60714 case X86ISD::VPPERM:
60715 case X86ISD::VPERMI:
60716 case X86ISD::VPERMV:
60717 case X86ISD::VPERMV3:
60718 case X86ISD::VPERMIL2:
60719 case X86ISD::VPERMILPI:
60720 case X86ISD::VPERMILPV:
60721 case X86ISD::VPERM2X128:
60722 case X86ISD::SHUF128:
60723 case X86ISD::VZEXT_MOVL:
60724 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60725 case X86ISD::FMADD_RND:
60726 case X86ISD::FMSUB:
60728 case X86ISD::FMSUB_RND:
60729 case X86ISD::FNMADD:
60731 case X86ISD::FNMADD_RND:
60732 case X86ISD::FNMSUB:
60734 case X86ISD::FNMSUB_RND:
60735 case ISD::FMA:
60736 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60739 case X86ISD::FMADDSUB:
60740 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60741 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60742 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60743 case X86ISD::MGATHER:
60744 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60745 case ISD::MGATHER:
60746 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60747 case X86ISD::PCMPEQ:
60748 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60749 case X86ISD::PMULDQ:
60750 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60751 case X86ISD::VPMADDUBSW:
60752 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60753 case X86ISD::VPMADD52L:
60754 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60755 case X86ISD::KSHIFTL:
60756 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60757 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60759 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60761 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60763 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60764 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60765 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60766 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60767 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60768 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60770 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60771 // clang-format on
60772 }
60773
60774 return SDValue();
60775}
60776
// NOTE(review): the enclosing function signature is not visible in this view.
// The visible body gates some transform on CMOV availability and restricts it
// to the GPR-sized integer types i32/i64 -- confirm the hook name in full source.
60778 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60779}
60780
60781// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
// NOTE(review): the signature line is not visible in this view. The body
// returns true for scalar types, and for vectors only once AVX512 is
// available; without AVX512 vectors return false so PACKSS can be used.
 60783 EVT ExtVT) const {
 60784 return Subtarget.hasAVX512() || !VT.isVector();
 60785}
60786
// Decide whether performing operation Opc directly on type VT is desirable.
// NOTE(review): the signature line is not visible in this view -- presumably
// X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT); confirm
// against the full source.
 60788 if (!isTypeLegal(VT))
 60789 return false;
 60790
 60791 // There are no vXi8 shifts.
 60792 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
 60793 return false;
 60794
 60795 // TODO: Almost no 8-bit ops are desirable because they have no actual
 60796 // size/speed advantages vs. 32-bit ops, but they do have a major
 60797 // potential disadvantage by causing partial register stalls.
 60798 //
 60799 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
 60800 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
 60801 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
 60802 // check for a constant operand to the multiply.
 60803 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
 60804 return false;
 60805
 60806 // i16 instruction encodings are longer and some i16 instructions are slow,
 60807 // so those are not desirable.
 60808 if (VT == MVT::i16) {
 60809 switch (Opc) {
 60810 default:
 60811 break;
// Loads and extensions of i16, and i16 multiplies, are always undesirable.
 60812 case ISD::LOAD:
 60813 case ISD::SIGN_EXTEND:
 60814 case ISD::ZERO_EXTEND:
 60815 case ISD::ANY_EXTEND:
 60816 case ISD::MUL:
 60817 return false;
// Plain ALU/shift ops on i16 are acceptable only with APX NDD forms.
 60818 case ISD::SHL:
 60819 case ISD::SRA:
 60820 case ISD::SRL:
 60821 case ISD::SUB:
 60822 case ISD::ADD:
 60823 case ISD::AND:
 60824 case ISD::OR:
 60825 case ISD::XOR:
 60826 // NDD instruction never has "partial register write" issue b/c it has
 60827 // destination register's upper bits [63:OSIZE]) zeroed even when
 60828 // OSIZE=8/16.
 60829 return Subtarget.hasNDD();
 60830 }
 60831 }
 60832
 60833 // Any legal type not explicitly accounted for above here is desirable.
 60834 return true;
 60835}
60836
// Expand an indirect jump-table branch. When the module enables CET branch
// protection ("cf-protection-branch"), emit an X86ISD::NT_BRIND node so ISel
// produces a jmp with the NOTRACK prefix; otherwise defer to the default
// TargetLowering expansion.
// NOTE(review): the first signature line(s) are not visible in this view.
 60839 int JTI,
 60840 SelectionDAG &DAG) const {
 60841 const Module *M = DAG.getMachineFunction().getFunction().getParent();
 60842 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
 60843 if (IsCFProtectionSupported) {
 60844 // In case control-flow branch protection is enabled, we need to add
 60845 // notrack prefix to the indirect branch.
 60846 // In order to do that we create NT_BRIND SDNode.
 60847 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
 60848 SDValue Chain = Value;
 60849 // Jump table debug info is only needed if CodeView is enabled.
// NOTE(review): a guard line between the comment above and the call below is
// not visible in this view -- confirm the debug-info call is conditional.
 60851 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
 60852 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
 60853 }
 60854
 60855 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
 60856}
60857
// Decide how a logic op of two SETCCs should be combined for x86.
// NOTE(review): several return statements (scalar/vector decisions and the
// final return value) are not visible in this view -- the body below only
// shows the type queries and the rationale comments; consult the full source.
 60860 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
 60862 EVT VT = LogicOp->getValueType(0);
 60863 EVT OpVT = SETCC0->getOperand(0).getValueType();
 60864 if (!VT.isInteger())
 60866
 60867 if (VT.isVector())
 60872
 60873 // Don't use `NotAnd` as even though `not` is generally shorter code size than
 60874 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
 60875 // `NotAnd` applies, `AddAnd` does as well.
 60876 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
 60877 // if we change that to `andn Y, X` it may be worth prefering `NotAnd` here.
 60879}
60880
// Decide whether Op should be promoted to a wider type (result in PVT).
// Targets i16 ops (long encodings, slow in places) and 8-bit multiply by a
// constant; declines promotion when it would destroy a load-folding,
// RMW-folding, atomic-RMW-folding, or (with ZU) imulzu zext-folding
// opportunity. NOTE(review): the signature line is not visible in this view
// -- presumably X86TargetLowering::IsDesirableToPromoteOp(SDValue, MVT&).
 60882 EVT VT = Op.getValueType();
 60883 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
 60884 isa<ConstantSDNode>(Op.getOperand(1));
 60885
 60886 // i16 is legal, but undesirable since i16 instruction encodings are longer
 60887 // and some i16 instructions are slow.
 60888 // 8-bit multiply-by-constant can usually be expanded to something cheaper
 60889 // using LEA and/or other ALU ops.
 60890 if (VT != MVT::i16 && !Is8BitMulByConstant)
 60891 return false;
 60892
// True when Op's only user stores back to the same address the load came
// from, i.e. a read-modify-write pattern that could fold into one memory op.
// NOTE(review): a check on the user's opcode between the user lookup and the
// casts is not visible in this view.
 60893 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
 60894 if (!Op.hasOneUse())
 60895 return false;
 60896 SDNode *User = *Op->user_begin();
 60898 return false;
 60899 auto *Ld = cast<LoadSDNode>(Load);
 60900 auto *St = cast<StoreSDNode>(User);
 60901 return Ld->getBasePtr() == St->getBasePtr();
 60902 };
 60903
// Same idea for atomic load/op/store sequences.
 60904 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
 60905 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
 60906 return false;
 60907 if (!Op.hasOneUse())
 60908 return false;
 60909 SDNode *User = *Op->user_begin();
 60910 if (User->getOpcode() != ISD::ATOMIC_STORE)
 60911 return false;
 60912 auto *Ld = cast<AtomicSDNode>(Load);
 60913 auto *St = cast<AtomicSDNode>(User);
 60914 return Ld->getBasePtr() == St->getBasePtr();
 60915 };
 60916
// True when Op's only user zero-extends it to i32/i64 (imulzu candidate).
 60917 auto IsFoldableZext = [](SDValue Op) {
 60918 if (!Op.hasOneUse())
 60919 return false;
 60920 SDNode *User = *Op->user_begin();
 60921 EVT VT = User->getValueType(0);
 60922 return (User->getOpcode() == ISD::ZERO_EXTEND &&
 60923 (VT == MVT::i32 || VT == MVT::i64));
 60924 };
 60925
 60926 bool Commute = false;
 60927 switch (Op.getOpcode()) {
 60928 default: return false;
 60929 case ISD::SIGN_EXTEND:
 60930 case ISD::ZERO_EXTEND:
 60931 case ISD::ANY_EXTEND:
 60932 break;
 60933 case ISD::SHL:
 60934 case ISD::SRA:
 60935 case ISD::SRL: {
 60936 SDValue N0 = Op.getOperand(0);
 60937 // Look out for (store (shl (load), x)).
 60938 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
 60939 return false;
 60940 break;
 60941 }
 60942 case ISD::MUL:
 60943 // When ZU is enabled, we prefer to not promote for MUL by a constant
 60944 // when there is an opportunity to fold a zext with imulzu.
 60945 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
 60946 (isa<ConstantSDNode>(Op.getOperand(0)) ||
 60947 isa<ConstantSDNode>(Op.getOperand(1))))
 60948 return false;
 60949 [[fallthrough]];
 60950 case ISD::ADD:
 60951 case ISD::AND:
 60952 case ISD::OR:
 60953 case ISD::XOR:
 60954 Commute = true;
 60955 [[fallthrough]];
 60956 case ISD::SUB: {
 60957 SDValue N0 = Op.getOperand(0);
 60958 SDValue N1 = Op.getOperand(1);
 60959 // Avoid disabling potential load folding opportunities.
 60960 if (X86::mayFoldLoad(N1, Subtarget) &&
 60961 (!Commute || !isa<ConstantSDNode>(N0) ||
 60962 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
 60963 return false;
 60964 if (X86::mayFoldLoad(N0, Subtarget) &&
 60965 ((Commute && !isa<ConstantSDNode>(N1)) ||
 60966 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
 60967 return false;
 60968 if (IsFoldableAtomicRMW(N0, Op) ||
 60969 (Commute && IsFoldableAtomicRMW(N1, Op)))
 60970 return false;
 60971 }
 60972 }
 60973
// Promote to 32 bits.
 60974 PVT = MVT::i32;
 60975 return true;
60977
60978//===----------------------------------------------------------------------===//
60979// X86 Inline Assembly Support
60980//===----------------------------------------------------------------------===//
60981
 60982 // Helper to match a string separated by whitespace.
// Returns true iff S consists of exactly the given Pieces in order, each
// separated by whitespace, with nothing left over.
// NOTE(review): the signature line and the declaration computing `Pos`
// (position of the next non-whitespace after a piece) are not visible here.
 60984 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
 60985
 60986 for (StringRef Piece : Pieces) {
 60987 if (!S.starts_with(Piece)) // Check if the piece matches.
 60988 return false;
 60989
 60990 S = S.substr(Piece.size());
 60992 if (Pos == 0) // We matched a prefix.
 60993 return false;
 60994
 60995 S = S.substr(Pos);
 60996 }
 60997
 60998 return S.empty();
 60999}
61000
 61002
// Returns true when the inline-asm clobber list consists of exactly the
// x86 flag registers: {cc, flags, fpsr} with an optional {dirflag}. Used to
// validate candidates for the bswap inline-asm replacement below.
// NOTE(review): the signature line is not visible in this view.
 61003 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
 61004 if (llvm::is_contained(AsmPieces, "~{cc}") &&
 61005 llvm::is_contained(AsmPieces, "~{flags}") &&
 61006 llvm::is_contained(AsmPieces, "~{fpsr}")) {
 61007
 61008 if (AsmPieces.size() == 3)
 61009 return true;
 61010 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
 61011 return true;
 61012 }
 61013 }
 61014 return false;
 61015}
61016
// Recognize hand-written bswap/byte-rotate inline asm and replace it with the
// llvm.bswap intrinsic equivalent. Handles:
//   - single "bswap[l|q] $0" style statements (i32/i64),
//   - "rorw/rolw $$8, ${0:w}" (i16 byte swap),
//   - the 3-instruction i32 pattern rorw/rorl/rorw,
//   - the classic i64 pattern bswap %eax / bswap %edx / xchgl.
// NOTE(review): the signature line and the actual replacement calls in the
// success paths are not visible in this view; the visible code only performs
// pattern matching and constraint validation.
 61018 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
 61019
 61020 StringRef AsmStr = IA->getAsmString();
 61021
// Only integer results whose width is a multiple of 16 can be byte-swapped.
 61022 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
 61023 if (!Ty || Ty->getBitWidth() % 16 != 0)
 61024 return false;
 61025
 61026 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
 61027 SmallVector<StringRef, 4> AsmPieces;
 61028 SplitString(AsmStr, AsmPieces, ";\n");
 61029
 61030 switch (AsmPieces.size()) {
 61031 default: return false;
 61032 case 1:
 61033 // FIXME: this should verify that we are targeting a 486 or better. If not,
 61034 // we will turn this bswap into something that will be lowered to logical
 61035 // ops instead of emitting the bswap asm. For now, we don't support 486 or
 61036 // lower so don't worry about this.
 61037 // bswap $0
 61038 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
 61039 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
 61040 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
 61041 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
 61042 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
 61043 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
 61044 // No need to check constraints, nothing other than the equivalent of
 61045 // "=r,0" would be valid here.
 61047 }
 61048
 61049 // rorw $$8, ${0:w} --> llvm.bswap.i16
 61050 if (CI->getType()->isIntegerTy(16) &&
 61051 IA->getConstraintString().starts_with("=r,0,") &&
 61052 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
 61053 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
 61054 AsmPieces.clear();
 61055 StringRef ConstraintsStr = IA->getConstraintString();
 61056 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
 61057 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
 61058 if (clobbersFlagRegisters(AsmPieces))
 61060 }
 61061 break;
 61062 case 3:
 61063 if (CI->getType()->isIntegerTy(32) &&
 61064 IA->getConstraintString().starts_with("=r,0,") &&
 61065 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
 61066 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
 61067 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
 61068 AsmPieces.clear();
 61069 StringRef ConstraintsStr = IA->getConstraintString();
 61070 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
 61071 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
 61072 if (clobbersFlagRegisters(AsmPieces))
 61074 }
 61075
 61076 if (CI->getType()->isIntegerTy(64)) {
 61077 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
 61078 if (Constraints.size() >= 2 &&
 61079 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
 61080 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
 61081 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
 61082 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
 61083 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
 61084 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
 61086 }
 61087 }
 61088 break;
 61089 }
 61090 return false;
 61091}
61092
// Map a "{@cc<cond>}" inline-asm flag-output constraint string to the
// corresponding X86 condition code. Aliases map to the same code (e.g. ccc ==
// ccb, ccz == cce, and the negated forms map to the inverse condition).
// NOTE(review): the signature line and the StringSwitch construction (and its
// .Default, presumably X86::COND_INVALID per the callers) are not visible in
// this view.
 61095 .Case("{@cca}", X86::COND_A)
 61096 .Case("{@ccae}", X86::COND_AE)
 61097 .Case("{@ccb}", X86::COND_B)
 61098 .Case("{@ccbe}", X86::COND_BE)
 61099 .Case("{@ccc}", X86::COND_B)
 61100 .Case("{@cce}", X86::COND_E)
 61101 .Case("{@ccz}", X86::COND_E)
 61102 .Case("{@ccg}", X86::COND_G)
 61103 .Case("{@ccge}", X86::COND_GE)
 61104 .Case("{@ccl}", X86::COND_L)
 61105 .Case("{@ccle}", X86::COND_LE)
 61106 .Case("{@ccna}", X86::COND_BE)
 61107 .Case("{@ccnae}", X86::COND_B)
 61108 .Case("{@ccnb}", X86::COND_AE)
 61109 .Case("{@ccnbe}", X86::COND_A)
 61110 .Case("{@ccnc}", X86::COND_AE)
 61111 .Case("{@ccne}", X86::COND_NE)
 61112 .Case("{@ccnz}", X86::COND_NE)
 61113 .Case("{@ccng}", X86::COND_LE)
 61114 .Case("{@ccnge}", X86::COND_L)
 61115 .Case("{@ccnl}", X86::COND_GE)
 61116 .Case("{@ccnle}", X86::COND_G)
 61117 .Case("{@ccno}", X86::COND_NO)
 61118 .Case("{@ccnp}", X86::COND_NP)
 61119 .Case("{@ccns}", X86::COND_NS)
 61120 .Case("{@cco}", X86::COND_O)
 61121 .Case("{@ccp}", X86::COND_P)
 61122 .Case("{@ccs}", X86::COND_S)
 61124 return Cond;
 61125}
61126
 61127 /// Given a constraint letter, return the type of constraint for this target.
// Classifies single-letter GCC constraints (register classes, specific
// registers, immediates), two-letter constraints ('Ws', 'Y*', 'j*'), and
// "{@cc...}" flag-output constraints; anything else defers to the base class.
// NOTE(review): the function signature lines are not visible in this view.
 61130 if (Constraint.size() == 1) {
 61131 switch (Constraint[0]) {
// Register-class constraints.
 61132 case 'R':
 61133 case 'q':
 61134 case 'Q':
 61135 case 'f':
 61136 case 't':
 61137 case 'u':
 61138 case 'y':
 61139 case 'x':
 61140 case 'v':
 61141 case 'l':
 61142 case 'k': // AVX512 masking registers.
 61143 return C_RegisterClass;
// Specific-register constraints (eax/ebx/ecx/edx/esi/edi, 'A' = eax+edx).
 61144 case 'a':
 61145 case 'b':
 61146 case 'c':
 61147 case 'd':
 61148 case 'S':
 61149 case 'D':
 61150 case 'A':
 61151 return C_Register;
// Immediate-range constraints.
 61152 case 'I':
 61153 case 'J':
 61154 case 'K':
 61155 case 'N':
 61156 case 'G':
 61157 case 'L':
 61158 case 'M':
 61159 return C_Immediate;
 61160 case 'C':
 61161 case 'e':
 61162 case 'Z':
 61163 return C_Other;
 61164 default:
 61165 break;
 61166 }
 61167 }
 61168 else if (Constraint.size() == 2) {
 61169 switch (Constraint[0]) {
 61170 default:
 61171 break;
 61172 case 'W':
 61173 if (Constraint[1] != 's')
 61174 break;
 61175 return C_Other;
 61176 case 'Y':
 61177 switch (Constraint[1]) {
 61178 default:
 61179 break;
 61180 case 'z':
 61181 return C_Register;
 61182 case 'i':
 61183 case 'm':
 61184 case 'k':
 61185 case 't':
 61186 case '2':
 61187 return C_RegisterClass;
 61188 }
 61189 break;
 61190 case 'j':
 61191 switch (Constraint[1]) {
 61192 default:
 61193 break;
 61194 case 'r':
 61195 case 'R':
 61196 return C_RegisterClass;
 61197 }
 61198 }
 61199 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
 61200 return C_Other;
 61201 return TargetLowering::getConstraintType(Constraint);
 61202}
61203
 61204 /// Examine constraint type and operand type and determine a weight value.
 61205 /// This object must already have been set up with the operand type
 61206 /// and the current alternative constraint selected.
// Weights are used by inline-asm constraint selection to pick the best
// alternative: CW_Invalid < CW_Default < CW_Constant/CW_Register <
// CW_SpecificReg. NOTE(review): the signature lines and the declaration
// initializing `Wt` (presumably to a default weight via the base class) are
// not visible in this view.
 61209 AsmOperandInfo &Info, const char *Constraint) const {
 61211 Value *CallOperandVal = Info.CallOperandVal;
 61212 // If we don't have a value, we can't do a match,
 61213 // but allow it at the lowest weight.
 61214 if (!CallOperandVal)
 61215 return CW_Default;
 61216 Type *Ty = CallOperandVal->getType();
 61217 // Look at the constraint type.
 61218 switch (*Constraint) {
 61219 default:
 61221 [[fallthrough]];
// GPR / specific-register constraints: only integers qualify.
 61222 case 'R':
 61223 case 'q':
 61224 case 'Q':
 61225 case 'a':
 61226 case 'b':
 61227 case 'c':
 61228 case 'd':
 61229 case 'S':
 61230 case 'D':
 61231 case 'A':
 61232 if (CallOperandVal->getType()->isIntegerTy())
 61233 Wt = CW_SpecificReg;
 61234 break;
// x87 stack registers: floating point only.
 61235 case 'f':
 61236 case 't':
 61237 case 'u':
 61238 if (Ty->isFloatingPointTy())
 61239 Wt = CW_SpecificReg;
 61240 break;
// MMX register: 64-bit value with MMX available.
 61241 case 'y':
 61242 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
 61243 Wt = CW_SpecificReg;
 61244 break;
 61245 case 'Y':
 61246 if (StringRef(Constraint).size() != 2)
 61247 break;
 61248 switch (Constraint[1]) {
 61249 default:
 61250 return CW_Invalid;
 61251 // XMM0
 61252 case 'z':
 61253 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
 61254 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
 61255 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
 61256 return CW_SpecificReg;
 61257 return CW_Invalid;
 61258 // Conditional OpMask regs (AVX512)
 61259 case 'k':
 61260 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
 61261 return CW_Register;
 61262 return CW_Invalid;
 61263 // Any MMX reg
 61264 case 'm':
 61265 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
 61266 return CW_SpecificReg;
 61267 return CW_Invalid;
 61268 // Any SSE reg when ISA >= SSE2, same as 'x'
 61269 case 'i':
 61270 case 't':
 61271 case '2':
 61272 if (!Subtarget.hasSSE2())
 61273 return CW_Invalid;
 61274 break;
 61275 }
 61276 break;
 61277 case 'j':
 61278 if (StringRef(Constraint).size() != 2)
 61279 break;
 61280 switch (Constraint[1]) {
 61281 default:
 61282 return CW_Invalid;
 61283 case 'r':
 61284 case 'R':
 61285 if (CallOperandVal->getType()->isIntegerTy())
 61286 Wt = CW_SpecificReg;
 61287 break;
 61288 }
 61289 break;
 61290 case 'v':
 61291 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
 61292 Wt = CW_Register;
 61293 [[fallthrough]];
 61294 case 'x':
 61295 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
 61296 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
 61297 Wt = CW_Register;
 61298 break;
 61299 case 'k':
 61300 // Enable conditional vector operations using %k<#> registers.
 61301 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
 61302 Wt = CW_Register;
 61303 break;
// Immediate-range constraints: weight as constant when the value fits.
 61304 case 'I':
 61305 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
 61306 if (C->getZExtValue() <= 31)
 61307 Wt = CW_Constant;
 61308 break;
 61309 case 'J':
 61310 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 61311 if (C->getZExtValue() <= 63)
 61312 Wt = CW_Constant;
 61313 break;
 61314 case 'K':
 61315 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 61316 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
 61317 Wt = CW_Constant;
 61318 break;
 61319 case 'L':
 61320 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 61321 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
 61322 Wt = CW_Constant;
 61323 break;
 61324 case 'M':
 61325 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 61326 if (C->getZExtValue() <= 3)
 61327 Wt = CW_Constant;
 61328 break;
 61329 case 'N':
 61330 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 61331 if (C->getZExtValue() <= 0xff)
 61332 Wt = CW_Constant;
 61333 break;
 61334 case 'G':
 61335 case 'C':
 61336 if (isa<ConstantFP>(CallOperandVal))
 61337 Wt = CW_Constant;
 61338 break;
 61339 case 'e':
 61340 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 61341 if ((C->getSExtValue() >= -0x80000000LL) &&
 61342 (C->getSExtValue() <= 0x7fffffffLL))
 61343 Wt = CW_Constant;
 61344 break;
 61345 case 'Z':
 61346 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
 61347 if (C->getZExtValue() <= 0xffffffff)
 61348 Wt = CW_Constant;
 61349 break;
 61350 }
 61351 return Wt;
 61352}
61353
 61354 /// Try to replace an X constraint, which matches anything, with another that
 61355 /// has more specific requirements based on the type of the corresponding
 61356 /// operand.
 61358 LowerXConstraint(EVT ConstraintVT) const {
 61359 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
 61360 // 'f' like normal targets.
 61361 if (ConstraintVT.isFloatingPoint()) {
 61362 if (Subtarget.hasSSE1())
 61363 return "x";
 61364 }
 61365
// Non-FP (or no-SSE) operands: let the generic implementation choose.
 61366 return TargetLowering::LowerXConstraint(ConstraintVT);
 61367}
61368
 61369 // Lower @cc targets via setcc.
// Lowers a "{@cc<cond>}" flag-output operand: reads EFLAGS, materializes the
// condition via SETCC, and zero-extends the result to the constraint type.
// Returns an empty SDValue when the constraint is not a @cc constraint.
// NOTE(review): the leading signature line is not visible in this view.
 61371 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
 61372 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
 61374 if (Cond == X86::COND_INVALID)
 61375 return SDValue();
 61376 // Check that return type is valid.
 61377 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
 61378 OpInfo.ConstraintVT.getSizeInBits() < 8)
 61379 report_fatal_error("Glue output operand is of invalid type");
 61380
 61381 // Get EFLAGS register. Only update chain when copyfrom is glued.
 61382 if (Glue.getNode()) {
 61383 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
 61384 Chain = Glue.getValue(1);
 61385 } else
 61386 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
 61387 // Extract CC code.
 61388 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
 61389 // Extend to 32-bits
 61390 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
 61391
 61392 return Result;
 61393}
61394
 61395 /// Lower the specified operand into the Ops vector.
 61396 /// If it is invalid, don't add anything to Ops.
// Validates immediates against the GCC x86 constraint ranges (I/J/K/L/M/N/O,
// e, Z), handles 'Ws' (block/global address) and 'i' (general immediate),
// then falls back to the target-independent implementation. NOTE(review):
// the leading signature line is not visible in this view; a few condition
// lines inside 'e'/'Z'/'i' (the isInt/isUInt checks and the extra-load check)
// are also not visible.
 61398 StringRef Constraint,
 61399 std::vector<SDValue> &Ops,
 61400 SelectionDAG &DAG) const {
 61401 SDValue Result;
 61402 char ConstraintLetter = Constraint[0];
 61403 switch (ConstraintLetter) {
 61404 default: break;
 61405 case 'I':
 61406 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61407 if (C->getZExtValue() <= 31) {
 61408 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 61409 Op.getValueType());
 61410 break;
 61411 }
 61412 }
 61413 return;
 61414 case 'J':
 61415 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61416 if (C->getZExtValue() <= 63) {
 61417 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 61418 Op.getValueType());
 61419 break;
 61420 }
 61421 }
 61422 return;
 61423 case 'K':
 61424 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61425 if (isInt<8>(C->getSExtValue())) {
 61426 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 61427 Op.getValueType())
 61428 break;
 61429 }
 61430 }
 61431 return;
 61432 case 'L':
 61433 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61434 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
 61435 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
 61436 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
 61437 Op.getValueType());
 61438 break;
 61439 }
 61440 }
 61441 return;
 61442 case 'M':
 61443 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61444 if (C->getZExtValue() <= 3) {
 61445 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 61446 Op.getValueType());
 61447 break;
 61448 }
 61449 }
 61450 return;
 61451 case 'N':
 61452 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61453 if (C->getZExtValue() <= 255) {
 61454 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 61455 Op.getValueType());
 61456 break;
 61457 }
 61458 }
 61459 return;
 61460 case 'O':
 61461 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61462 if (C->getZExtValue() <= 127) {
 61463 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 61464 Op.getValueType());
 61465 break;
 61466 }
 61467 }
 61468 return;
 61469 case 'e': {
 61470 // 32-bit signed value
 61471 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61473 C->getSExtValue())) {
 61474 // Widen to 64 bits here to get it sign extended.
 61475 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
 61476 break;
 61477 }
 61478 // FIXME gcc accepts some relocatable values here too, but only in certain
 61479 // memory models; it's complicated.
 61480 }
 61481 return;
 61482 }
 61483 case 'W': {
 61484 assert(Constraint[1] == 's');
 61485 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
 61486 // offset.
 61487 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
 61488 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
 61489 BA->getValueType(0)));
 61490 } else {
 61491 int64_t Offset = 0;
 61492 if (Op->getOpcode() == ISD::ADD &&
 61493 isa<ConstantSDNode>(Op->getOperand(1))) {
 61494 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
 61495 Op = Op->getOperand(0);
 61496 }
 61497 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
 61498 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
 61499 GA->getValueType(0), Offset));
 61500 }
 61501 return;
 61502 }
 61503 case 'Z': {
 61504 // 32-bit unsigned value
 61505 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
 61507 C->getZExtValue())) {
 61508 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
 61509 Op.getValueType());
 61510 break;
 61511 }
 61512 }
 61513 // FIXME gcc accepts some relocatable values here too, but only in certain
 61514 // memory models; it's complicated.
 61515 return;
 61516 }
 61517 case 'i': {
 61518 // Literal immediates are always ok.
 61519 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
 61520 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
 61521 BooleanContent BCont = getBooleanContents(MVT::i64);
 61522 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
 61524 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
 61525 : CST->getSExtValue();
 61526 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
 61527 break;
 61528 }
 61529
 61530 // In any sort of PIC mode addresses need to be computed at runtime by
 61531 // adding in a register or some sort of table lookup. These can't
 61532 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
 61533 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
 61534 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
 61535 return;
 61536
 61537 // If we are in non-pic codegen mode, we allow the address of a global (with
 61538 // an optional displacement) to be used with 'i'.
 61539 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
 61540 // If we require an extra load to get this address, as in PIC mode, we
 61541 // can't accept it.
 61543 Subtarget.classifyGlobalReference(GA->getGlobal())))
 61544 return;
 61545 break;
 61546 }
 61547 }
 61548
 61549 if (Result.getNode()) {
 61550 Ops.push_back(Result);
 61551 return;
 61552 }
 61553 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 61554}
61555
61556/// Check if \p RC is a general purpose register class.
61557/// I.e., GR* or one of their variant.
61558static bool isGRClass(const TargetRegisterClass &RC) {
61559 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61560 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61561 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61562 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61563 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61564}
61565
61566/// Check if \p RC is a vector register class.
61567/// I.e., FR* / VR* or one of their variant.
61568static bool isFRClass(const TargetRegisterClass &RC) {
61569 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61570 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61571 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61572 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61573 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61574 RC.hasSuperClassEq(&X86::VR512RegClass);
61575}
61576
61577/// Check if \p RC is a mask register class.
61578/// I.e., VK* or one of their variant.
61579static bool isVKClass(const TargetRegisterClass &RC) {
61580 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61581 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61582 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61583 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61584 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61585 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61586 RC.hasSuperClassEq(&X86::VK64RegClass);
61587}
61588
61589static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61590 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61591}
61592
// Resolve an inline-asm register constraint ("r", "q", "x", "v", "k", "Y?",
// "j?", a condition-code output constraint, or an explicit "{reg}" name)
// together with the operand type VT into a (physical register, register
// class) pair. A register value of 0 means "any register of the class".
61593 std::pair<unsigned, const TargetRegisterClass *>
// NOTE(review): the qualified-name line of this definition
// (X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo
// *TRI, — doxygen line 61594) was dropped by the documentation extraction.
61595 StringRef Constraint,
61596 MVT VT) const {
61597 // First, see if this is a constraint that directly corresponds to an LLVM
61598 // register class.
61599 if (Constraint.size() == 1) {
61600 // GCC Constraint Letters
61601 switch (Constraint[0]) {
61602 default: break;
61603 // 'A' means [ER]AX + [ER]DX.
61604 case 'A':
61605 if (Subtarget.is64Bit())
61606 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61607 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61608 "Expecting 64, 32 or 16 bit subtarget");
61609 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61610
61611 // TODO: Slight differences here in allocation order and leaving
61612 // RIP in the class. Do they matter any more here than they do
61613 // in the normal allocation?
// 'k': AVX-512 mask registers; the VK class is chosen by the bit width of VT,
// with the 32/64-bit classes additionally gated on BWI.
61614 case 'k':
61615 if (Subtarget.hasAVX512()) {
61616 if (VT == MVT::v1i1 || VT == MVT::i1)
61617 return std::make_pair(0U, &X86::VK1RegClass);
61618 if (VT == MVT::v8i1 || VT == MVT::i8)
61619 return std::make_pair(0U, &X86::VK8RegClass);
61620 if (VT == MVT::v16i1 || VT == MVT::i16)
61621 return std::make_pair(0U, &X86::VK16RegClass);
61622 }
61623 if (Subtarget.hasBWI()) {
61624 if (VT == MVT::v32i1 || VT == MVT::i32)
61625 return std::make_pair(0U, &X86::VK32RegClass);
61626 if (VT == MVT::v64i1 || VT == MVT::i64)
61627 return std::make_pair(0U, &X86::VK64RegClass);
61628 }
61629 break;
// 'q'/'Q'/'r'/'l'/'R': general-purpose register families. The *_NOREX2
// classes are used unless useEGPRInlineAsm() allows the full GR classes.
61630 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61631 if (Subtarget.is64Bit()) {
61632 if (VT == MVT::i8 || VT == MVT::i1)
61633 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61634 ? &X86::GR8RegClass
61635 : &X86::GR8_NOREX2RegClass);
61636 if (VT == MVT::i16)
61637 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61638 ? &X86::GR16RegClass
61639 : &X86::GR16_NOREX2RegClass);
61640 if (VT == MVT::i32 || VT == MVT::f32)
61641 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61642 ? &X86::GR32RegClass
61643 : &X86::GR32_NOREX2RegClass);
61644 if (VT != MVT::f80 && !VT.isVector())
61645 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61646 ? &X86::GR64RegClass
61647 : &X86::GR64_NOREX2RegClass);
61648 break;
61649 }
61650 [[fallthrough]];
61651 // 32-bit fallthrough
61652 case 'Q': // Q_REGS
61653 if (VT == MVT::i8 || VT == MVT::i1)
61654 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61655 if (VT == MVT::i16)
61656 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61657 if (VT == MVT::i32 || VT == MVT::f32 ||
61658 (!VT.isVector() && !Subtarget.is64Bit()))
61659 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61660 if (VT != MVT::f80 && !VT.isVector())
61661 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61662 break;
61663 case 'r': // GENERAL_REGS
61664 case 'l': // INDEX_REGS
61665 if (VT == MVT::i8 || VT == MVT::i1)
61666 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61667 ? &X86::GR8RegClass
61668 : &X86::GR8_NOREX2RegClass);
61669 if (VT == MVT::i16)
61670 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61671 ? &X86::GR16RegClass
61672 : &X86::GR16_NOREX2RegClass);
61673 if (VT == MVT::i32 || VT == MVT::f32 ||
61674 (!VT.isVector() && !Subtarget.is64Bit()))
61675 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61676 ? &X86::GR32RegClass
61677 : &X86::GR32_NOREX2RegClass);
61678 if (VT != MVT::f80 && !VT.isVector())
61679 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61680 ? &X86::GR64RegClass
61681 : &X86::GR64_NOREX2RegClass);
61682 break;
61683 case 'R': // LEGACY_REGS
61684 if (VT == MVT::i8 || VT == MVT::i1)
61685 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61686 if (VT == MVT::i16)
61687 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61688 if (VT == MVT::i32 || VT == MVT::f32 ||
61689 (!VT.isVector() && !Subtarget.is64Bit()))
61690 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61691 if (VT != MVT::f80 && !VT.isVector())
61692 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61693 break;
61694 case 'f': // FP Stack registers.
61695 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61696 // value to the correct fpstack register class.
61697 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61698 return std::make_pair(0U, &X86::RFP32RegClass);
61699 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61700 return std::make_pair(0U, &X86::RFP64RegClass);
61701 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61702 return std::make_pair(0U, &X86::RFP80RegClass);
61703 break;
61704 case 'y': // MMX_REGS if MMX allowed.
61705 if (!Subtarget.hasMMX()) break;
61706 return std::make_pair(0U, &X86::VR64RegClass);
// 'x'/'v': SSE/AVX vector registers; 'v' additionally allows the EVEX-only
// (X-suffixed) classes when the required feature (VLX/FP16/AVX512) is present.
61707 case 'v':
61708 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61709 if (!Subtarget.hasSSE1()) break;
61710 bool VConstraint = (Constraint[0] == 'v');
61711
61712 switch (VT.SimpleTy) {
61713 default: break;
61714 // Scalar SSE types.
61715 case MVT::f16:
61716 if (VConstraint && Subtarget.hasFP16())
61717 return std::make_pair(0U, &X86::FR16XRegClass);
61718 break;
61719 case MVT::f32:
61720 case MVT::i32:
61721 if (VConstraint && Subtarget.hasVLX())
61722 return std::make_pair(0U, &X86::FR32XRegClass);
61723 return std::make_pair(0U, &X86::FR32RegClass);
61724 case MVT::f64:
61725 case MVT::i64:
61726 if (VConstraint && Subtarget.hasVLX())
61727 return std::make_pair(0U, &X86::FR64XRegClass);
61728 return std::make_pair(0U, &X86::FR64RegClass);
61729 case MVT::i128:
61730 if (Subtarget.is64Bit()) {
61731 if (VConstraint && Subtarget.hasVLX())
61732 return std::make_pair(0U, &X86::VR128XRegClass);
61733 return std::make_pair(0U, &X86::VR128RegClass);
61734 }
61735 break;
61736 // Vector types and fp128.
61737 case MVT::v8f16:
61738 if (!Subtarget.hasFP16())
61739 break;
61740 if (VConstraint)
61741 return std::make_pair(0U, &X86::VR128XRegClass);
61742 return std::make_pair(0U, &X86::VR128RegClass);
61743 case MVT::v8bf16:
61744 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61745 break;
61746 if (VConstraint)
61747 return std::make_pair(0U, &X86::VR128XRegClass);
61748 return std::make_pair(0U, &X86::VR128RegClass);
61749 case MVT::f128:
61750 if (!Subtarget.is64Bit())
61751 break;
61752 [[fallthrough]];
61753 case MVT::v16i8:
61754 case MVT::v8i16:
61755 case MVT::v4i32:
61756 case MVT::v2i64:
61757 case MVT::v4f32:
61758 case MVT::v2f64:
61759 if (VConstraint && Subtarget.hasVLX())
61760 return std::make_pair(0U, &X86::VR128XRegClass);
61761 return std::make_pair(0U, &X86::VR128RegClass);
61762 // AVX types.
61763 case MVT::v16f16:
61764 if (!Subtarget.hasFP16())
61765 break;
61766 if (VConstraint)
61767 return std::make_pair(0U, &X86::VR256XRegClass);
61768 return std::make_pair(0U, &X86::VR256RegClass);
61769 case MVT::v16bf16:
61770 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61771 break;
61772 if (VConstraint)
61773 return std::make_pair(0U, &X86::VR256XRegClass);
61774 return std::make_pair(0U, &X86::VR256RegClass);
61775 case MVT::v32i8:
61776 case MVT::v16i16:
61777 case MVT::v8i32:
61778 case MVT::v4i64:
61779 case MVT::v8f32:
61780 case MVT::v4f64:
61781 if (VConstraint && Subtarget.hasVLX())
61782 return std::make_pair(0U, &X86::VR256XRegClass);
61783 if (Subtarget.hasAVX())
61784 return std::make_pair(0U, &X86::VR256RegClass);
61785 break;
// 512-bit types: plain 'x' is limited to the VR512_0_15 range; 'v' may use
// the full VR512 class.
61786 case MVT::v32f16:
61787 if (!Subtarget.hasFP16())
61788 break;
61789 if (VConstraint)
61790 return std::make_pair(0U, &X86::VR512RegClass);
61791 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61792 case MVT::v32bf16:
61793 if (!Subtarget.hasBF16())
61794 break;
61795 if (VConstraint)
61796 return std::make_pair(0U, &X86::VR512RegClass);
61797 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61798 case MVT::v64i8:
61799 case MVT::v32i16:
61800 case MVT::v8f64:
61801 case MVT::v16f32:
61802 case MVT::v16i32:
61803 case MVT::v8i64:
61804 if (!Subtarget.hasAVX512()) break;
61805 if (VConstraint)
61806 return std::make_pair(0U, &X86::VR512RegClass);
61807 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61808 }
61809 break;
61810 }
// Two-letter 'Y?' constraints. 'Yi'/'Yt'/'Y2' recurse as plain "x";
// 'Yz' pins the first vector register (XMM0/YMM0/ZMM0).
61811 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61812 switch (Constraint[1]) {
61813 default:
61814 break;
61815 case 'i':
61816 case 't':
61817 case '2':
61818 return getRegForInlineAsmConstraint(TRI, "x", VT);
61819 case 'm':
61820 if (!Subtarget.hasMMX()) break;
61821 return std::make_pair(0U, &X86::VR64RegClass);
61822 case 'z':
61823 if (!Subtarget.hasSSE1()) break;
61824 switch (VT.SimpleTy) {
61825 default: break;
61826 // Scalar SSE types.
61827 case MVT::f16:
61828 if (!Subtarget.hasFP16())
61829 break;
61830 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61831 case MVT::f32:
61832 case MVT::i32:
61833 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61834 case MVT::f64:
61835 case MVT::i64:
61836 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61837 case MVT::v8f16:
61838 if (!Subtarget.hasFP16())
61839 break;
61840 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61841 case MVT::v8bf16:
61842 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61843 break;
61844 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61845 case MVT::f128:
61846 case MVT::v16i8:
61847 case MVT::v8i16:
61848 case MVT::v4i32:
61849 case MVT::v2i64:
61850 case MVT::v4f32:
61851 case MVT::v2f64:
61852 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61853 // AVX types.
61854 case MVT::v16f16:
61855 if (!Subtarget.hasFP16())
61856 break;
61857 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61858 case MVT::v16bf16:
61859 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61860 break;
61861 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61862 case MVT::v32i8:
61863 case MVT::v16i16:
61864 case MVT::v8i32:
61865 case MVT::v4i64:
61866 case MVT::v8f32:
61867 case MVT::v4f64:
61868 if (Subtarget.hasAVX())
61869 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61870 break;
61871 case MVT::v32f16:
61872 if (!Subtarget.hasFP16())
61873 break;
61874 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61875 case MVT::v32bf16:
61876 if (!Subtarget.hasBF16())
61877 break;
61878 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61879 case MVT::v64i8:
61880 case MVT::v32i16:
61881 case MVT::v8f64:
61882 case MVT::v16f32:
61883 case MVT::v16i32:
61884 case MVT::v8i64:
61885 if (Subtarget.hasAVX512())
61886 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61887 break;
61888 }
61889 break;
61890 case 'k':
61891 // This register class doesn't allocate k0 for masked vector operation.
61892 if (Subtarget.hasAVX512()) {
61893 if (VT == MVT::v1i1 || VT == MVT::i1)
61894 return std::make_pair(0U, &X86::VK1WMRegClass);
61895 if (VT == MVT::v8i1 || VT == MVT::i8)
61896 return std::make_pair(0U, &X86::VK8WMRegClass);
61897 if (VT == MVT::v16i1 || VT == MVT::i16)
61898 return std::make_pair(0U, &X86::VK16WMRegClass);
61899 }
61900 if (Subtarget.hasBWI()) {
61901 if (VT == MVT::v32i1 || VT == MVT::i32)
61902 return std::make_pair(0U, &X86::VK32WMRegClass);
61903 if (VT == MVT::v64i1 || VT == MVT::i64)
61904 return std::make_pair(0U, &X86::VK64WMRegClass);
61905 }
61906 break;
61907 }
// Two-letter 'j?' constraints: 'jr' forces the NOREX2 GPR classes, 'jR'
// the full GPR classes, regardless of useEGPRInlineAsm().
61908 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61909 switch (Constraint[1]) {
61910 default:
61911 break;
61912 case 'r':
61913 if (VT == MVT::i8 || VT == MVT::i1)
61914 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61915 if (VT == MVT::i16)
61916 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61917 if (VT == MVT::i32 || VT == MVT::f32)
61918 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61919 if (VT != MVT::f80 && !VT.isVector())
61920 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61921 break;
61922 case 'R':
61923 if (VT == MVT::i8 || VT == MVT::i1)
61924 return std::make_pair(0U, &X86::GR8RegClass);
61925 if (VT == MVT::i16)
61926 return std::make_pair(0U, &X86::GR16RegClass);
61927 if (VT == MVT::i32 || VT == MVT::f32)
61928 return std::make_pair(0U, &X86::GR32RegClass);
61929 if (VT != MVT::f80 && !VT.isVector())
61930 return std::make_pair(0U, &X86::GR64RegClass);
61931 break;
61932 }
61933 }
61934
// Condition-code output constraints (e.g. "@cca") produce a GR32.
61935 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61936 return std::make_pair(0U, &X86::GR32RegClass);
61937
61938 // Use the default implementation in TargetLowering to convert the register
61939 // constraint into a member of a register class.
61940 std::pair<Register, const TargetRegisterClass*> Res;
// NOTE(review): the line assigning Res from the TargetLowering base
// implementation (doxygen line 61941) was dropped by the documentation
// extraction; Res is populated there before the checks below.
61942
61943 // Not found as a standard register?
61944 if (!Res.second) {
61945 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61946 // to/from f80.
61947 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61948 // Map st(0) -> st(7) -> ST0
61949 if (Constraint.size() == 7 && Constraint[0] == '{' &&
61950 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
61951 Constraint[3] == '(' &&
61952 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
61953 Constraint[5] == ')' && Constraint[6] == '}') {
61954 // st(7) is not allocatable and thus not a member of RFP80. Return
61955 // singleton class in cases where we have a reference to it.
61956 if (Constraint[4] == '7')
61957 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
61958 return std::make_pair(X86::FP0 + Constraint[4] - '0',
61959 &X86::RFP80RegClass);
61960 }
61961
61962 // GCC allows "st(0)" to be called just plain "st".
61963 if (StringRef("{st}").equals_insensitive(Constraint))
61964 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
61965 }
61966
61967 // flags -> EFLAGS
61968 if (StringRef("{flags}").equals_insensitive(Constraint))
61969 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
61970
61971 // dirflag -> DF
61972 // Only allow for clobber.
61973 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
61974 VT == MVT::Other)
61975 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
61976
61977 // fpsr -> FPSW
61978 // Only allow for clobber.
61979 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
61980 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
61981
61982 return Res;
61983 }
61984
61985 // Make sure it isn't a register that requires 64-bit mode.
61986 if (!Subtarget.is64Bit() &&
61987 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
61988 TRI->getEncodingValue(Res.first) >= 8) {
61989 // Register requires REX prefix, but we're in 32-bit mode.
61990 return std::make_pair(0, nullptr);
61991 }
61992
61993 // Make sure it isn't a register that requires AVX512.
61994 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
61995 TRI->getEncodingValue(Res.first) & 0x10) {
61996 // Register requires EVEX prefix.
61997 return std::make_pair(0, nullptr);
61998 }
61999
62000 // Otherwise, check to see if this is a register class of the wrong value
62001 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62002 // turn into {ax},{dx}.
62003 // MVT::Other is used to specify clobber names.
62004 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62005 return Res; // Correct type already, nothing to do.
62006
62007 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
62008 // return "eax". This should even work for things like getting 64bit integer
62009 // registers when given an f64 type.
62010 const TargetRegisterClass *Class = Res.second;
62011 // The generic code will match the first register class that contains the
62012 // given register. Thus, based on the ordering of the tablegened file,
62013 // the "plain" GR classes might not come first.
62014 // Therefore, use a helper method.
62015 if (isGRClass(*Class)) {
62016 unsigned Size = VT.getSizeInBits();
62017 if (Size == 1) Size = 8;
62018 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62019 return std::make_pair(0, nullptr);
62020 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62021 if (DestReg.isValid()) {
62022 bool is64Bit = Subtarget.is64Bit();
62023 const TargetRegisterClass *RC =
62024 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62025 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62026 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62027 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62028 if (Size == 64 && !is64Bit) {
62029 // Model GCC's behavior here and select a fixed pair of 32-bit
62030 // registers.
62031 switch (DestReg) {
62032 case X86::RAX:
62033 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62034 case X86::RDX:
62035 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62036 case X86::RCX:
62037 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62038 case X86::RBX:
62039 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62040 case X86::RSI:
62041 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62042 case X86::RDI:
62043 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62044 case X86::RBP:
62045 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62046 default:
62047 return std::make_pair(0, nullptr);
62048 }
62049 }
62050 if (RC && RC->contains(DestReg))
62051 return std::make_pair(DestReg, RC);
62052 return Res;
62053 }
62054 // No register found/type mismatch.
62055 return std::make_pair(0, nullptr);
62056 } else if (isFRClass(*Class)) {
62057 // Handle references to XMM physical registers that got mapped into the
62058 // wrong class. This can happen with constraints like {xmm0} where the
62059 // target independent register mapper will just pick the first match it can
62060 // find, ignoring the required type.
62061
62062 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62063 if (VT == MVT::f16)
62064 Res.second = &X86::FR16XRegClass;
62065 else if (VT == MVT::f32 || VT == MVT::i32)
62066 Res.second = &X86::FR32XRegClass;
62067 else if (VT == MVT::f64 || VT == MVT::i64)
62068 Res.second = &X86::FR64XRegClass;
62069 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62070 Res.second = &X86::VR128XRegClass;
62071 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62072 Res.second = &X86::VR256XRegClass;
62073 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62074 Res.second = &X86::VR512RegClass;
62075 else {
62076 // Type mismatch and not a clobber: Return an error;
62077 Res.first = 0;
62078 Res.second = nullptr;
62079 }
62080 } else if (isVKClass(*Class)) {
// Re-size mask-register classes the same way, by the bit width of VT.
62081 if (VT == MVT::v1i1 || VT == MVT::i1)
62082 Res.second = &X86::VK1RegClass;
62083 else if (VT == MVT::v8i1 || VT == MVT::i8)
62084 Res.second = &X86::VK8RegClass;
62085 else if (VT == MVT::v16i1 || VT == MVT::i16)
62086 Res.second = &X86::VK16RegClass;
62087 else if (VT == MVT::v32i1 || VT == MVT::i32)
62088 Res.second = &X86::VK32RegClass;
62089 else if (VT == MVT::v64i1 || VT == MVT::i64)
62090 Res.second = &X86::VK64RegClass;
62091 else {
62092 // Type mismatch and not a clobber: Return an error;
62093 Res.first = 0;
62094 Res.second = nullptr;
62095 }
62096 }
62097
62098 return Res;
62099}
62100
// NOTE(review): the signature line of this definition (doxygen line 62101,
// presumably X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) —
// confirm against upstream) was dropped by the documentation extraction.
// Returns true only for scalar types when the function is minsize, so a
// single div instruction is kept instead of an expanded sequence.
62102 // Integer division on x86 is expensive. However, when aggressively optimizing
62103 // for code size, we prefer to use a div instruction, as it is usually smaller
62104 // than the alternative sequence.
62105 // The exception to this is vector division. Since x86 doesn't have vector
62106 // integer division, leaving the division as-is is a loss even in terms of
62107 // size, because it will have to be scalarized, while the alternative code
62108 // sequence can be performed in vector form.
62109 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62110 return OptSize && !VT.isVector();
62111}
62112
// Mark the current function as using split callee-saved-register handling.
// Only supported in 64-bit mode; otherwise this is a no-op.
62113 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62114 if (!Subtarget.is64Bit())
62115 return;
62116
62117 // Update IsSplitCSR in X86MachineFunctionInfo.
// NOTE(review): the first half of this statement (doxygen line 62118,
// declaring AFI and starting the assignment) was dropped by the
// documentation extraction.
62119 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62120 AFI->setIsSplitCSR(true);
62121}
62122
62123void X86TargetLowering::insertCopiesSplitCSR(
62124 MachineBasicBlock *Entry,
62125 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62126 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62127 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62128 if (!IStart)
62129 return;
62130
62131 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62132 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62133 MachineBasicBlock::iterator MBBI = Entry->begin();
62134 for (const MCPhysReg *I = IStart; *I; ++I) {
62135 const TargetRegisterClass *RC = nullptr;
62136 if (X86::GR64RegClass.contains(*I))
62137 RC = &X86::GR64RegClass;
62138 else
62139 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62140
62141 Register NewVR = MRI->createVirtualRegister(RC);
62142 // Create copy from CSR to a virtual register.
62143 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62144 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62145 // nounwind. If we want to generalize this later, we may need to emit
62146 // CFI pseudo-instructions.
62147 assert(
62148 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62149 "Function should be nounwind in insertCopiesSplitCSR!");
62150 Entry->addLiveIn(*I);
62151 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62152 .addReg(*I);
62153
62154 // Insert the copy-back instructions right before the terminator.
62155 for (auto *Exit : Exits)
62156 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62157 TII->get(TargetOpcode::COPY), *I)
62158 .addReg(NewVR);
62159 }
62160}
62161
// NOTE(review): the signature line of this definition (doxygen line 62162;
// presumably X86TargetLowering::supportSwiftError — confirm against
// upstream) was dropped by the documentation extraction. The visible body
// simply enables the feature only for 64-bit subtargets.
62163 return Subtarget.is64Bit();
62164}
62165
// Emit a KCFI_CHECK pseudo in front of an indirect call carrying a CFI type.
// NOTE(review): the first lines of this definition's signature (doxygen
// lines 62166-62168; presumably X86TargetLowering::EmitKCFICheck(
// MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, ...) — confirm
// against upstream) were dropped by the documentation extraction.
62169 const TargetInstrInfo *TII) const {
62170 assert(MBBI->isCall() && MBBI->getCFIType() &&
62171 "Invalid call instruction for a KCFI check");
62172
62173 MachineFunction &MF = *MBB.getParent();
62174 // If the call target is a memory operand, unfold it and use R11 for the
62175 // call, so KCFI_CHECK won't have to recompute the address.
62176 switch (MBBI->getOpcode()) {
62177 case X86::CALL64m:
62178 case X86::CALL64m_NT:
62179 case X86::TAILJMPm64:
62180 case X86::TAILJMPm64_REX: {
// NOTE(review): two lines (doxygen 62181-62182) were dropped by the
// extraction here; they appear to declare OrigCall (a copy of MBBI) and the
// NewMIs vector used below.
62183 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62184 /*UnfoldStore=*/false, NewMIs))
62185 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62186 for (auto *NewMI : NewMIs)
62187 MBBI = MBB.insert(OrigCall, NewMI);
62188 assert(MBBI->isCall() &&
62189 "Unexpected instruction after memory operand unfolding");
// Preserve call-site metadata and the CFI type on the new (unfolded) call,
// then delete the original memory-form call.
62190 if (OrigCall->shouldUpdateAdditionalCallInfo())
62191 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62192 MBBI->setCFIType(MF, OrigCall->getCFIType());
62193 OrigCall->eraseFromParent();
62194 break;
62195 }
62196 default:
62197 break;
62198 }
62199
// Determine the register holding the call target: either the register
// operand of an indirect call, or R11 for indirect-thunk direct calls.
62200 MachineOperand &Target = MBBI->getOperand(0);
62201 Register TargetReg;
62202 switch (MBBI->getOpcode()) {
62203 case X86::CALL64r:
62204 case X86::CALL64r_ImpCall:
62205 case X86::CALL64r_NT:
62206 case X86::TAILJMPr64:
62207 case X86::TAILJMPr64_REX:
62208 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62209 Target.setIsRenamable(false);
62210 TargetReg = Target.getReg();
62211 break;
62212 case X86::CALL64pcrel32:
62213 case X86::TAILJMPd64:
62214 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62215 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62216 // 64-bit indirect thunk calls.
62217 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62218 "Unexpected register for an indirect thunk call");
62219 TargetReg = X86::R11;
62220 break;
62221 default:
62222 llvm_unreachable("Unexpected CFI call opcode");
62223 break;
62224 }
62225
62226 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62227 .addReg(TargetReg)
62228 .addImm(MBBI->getCFIType())
62229 .getInstr();
62230}
62231
62232 /// Returns true if stack probing through a function call is requested.
// NOTE(review): the signature line of this definition (doxygen line 62233)
// was dropped by the documentation extraction. A call-based probe is in use
// exactly when getStackProbeSymbolName() returns a non-empty symbol.
62234 return !getStackProbeSymbolName(MF).empty();
62235}
62236
62237 /// Returns true if stack probing through inline assembly is requested.
// NOTE(review): the signature line of this definition (doxygen line 62238)
// was dropped by the documentation extraction.
62239
62240 // No inline stack probe for Windows, they have their own mechanism.
62241 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62242 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62243 return false;
62244
62245 // If the function specifically requests inline stack probes, emit them.
62246 if (MF.getFunction().hasFnAttribute("probe-stack"))
62247 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62248 "inline-asm";
62249
// Default: no inline probing unless explicitly requested above.
62250 return false;
62251}
62252
62253 /// Returns the name of the symbol used to emit stack probes or the empty
62254 /// string if not applicable.
// NOTE(review): the signature lines of this definition (doxygen lines
// 62255-62256) were dropped by the documentation extraction.
62257 // Inline Stack probes disable stack probe call
62258 if (hasInlineStackProbe(MF))
62259 return "";
62260
62261 // If the function specifically requests stack probes, emit them.
62262 if (MF.getFunction().hasFnAttribute("probe-stack"))
62263 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62264
62265 // Generally, if we aren't on Windows, the platform ABI does not include
62266 // support for stack probes, so don't emit them.
62267 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62268 Subtarget.isTargetMachO() ||
62269 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62270 return "";
62271
62272 // We need a stack probe to conform to the Windows ABI. Choose the right
62273 // symbol.
// Cygwin/MinGW use the triple-underscore/alloca names; MSVC environments
// use __chkstk/_chkstk.
62274 if (Subtarget.is64Bit())
62275 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62276 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62277}
62278
// Returns the stack probe interval in bytes for this function.
62279 unsigned
// NOTE(review): the line naming this definition (doxygen line 62280;
// presumably X86TargetLowering::getStackProbeSize(const MachineFunction &MF)
// — confirm against upstream) was dropped by the documentation extraction.
62281 // The default stack probe size is 4096 if the function has no stackprobesize
62282 // attribute.
62283 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62284 4096);
62285}
62286
// NOTE(review): this definition is only partially visible — the signature
// line (doxygen 62287) and the two return statements (doxygen 62290-62291)
// were dropped by the documentation extraction. What remains shows the
// innermost-loop alignment override applying only when the
// x86-experimental-pref-innermost-loop-alignment option was given on the
// command line.
62288 if (ML && ML->isInnermost() &&
62289 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62292}
unsigned const MachineRegisterInfo * MRI
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
Definition: BitTracker.cpp:68
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Looks at all the uses of the given value Returns the Liveness deduced from the uses of this value Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses If the result is MaybeLiveUses might be modified but its content should be ignored(since it might not be complete). DeadArgumentEliminationPass
uint64_t Addr
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:546
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
#define R2(n)
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization splits it up into chunks.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are string literals.
#define LLVM_DEBUG(...)
Definition: Debug.h:119
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) reduction patterns.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed by extraction.
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS node.
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or is zero.
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PACKSS operations.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and vector insert.
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the inlane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask, is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector of zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext) zext(add_nuw(x, C)) --> add(zext(x), C_zext) Promoting a...
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)) This undoes the inverse fold performed in InstCom...
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? Current x86 isa includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
if(isa< SExtInst >(LHS)) std auto IsFreeTruncation
static const unsigned FramePtr
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition: APFloat.cpp:6082
void clearSign()
Definition: APFloat.h:1298
opStatus next(bool nextDown)
Definition: APFloat.h:1254
void changeSign()
Definition: APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1079
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
APInt abs() const
Get the absolute value.
Definition: APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are both set.
Definition: APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1079
int32_t exactLogBase2() const
Definition: APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
unsigned countTrailingZeros() const
Definition: APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1435
unsigned logBase2() const
Definition: APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:471
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:973
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:206
iterator begin() const
Definition: ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:191
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:657
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:843
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:721
@ Add
*p = old + v
Definition: Instructions.h:725
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:777
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ Sub
*p = old - v
Definition: Instructions.h:727
@ And
*p = old & v
Definition: Instructions.h:729
@ Xor
*p = old ^ v
Definition: Instructions.h:735
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:781
@ FSub
*p = old - v
Definition: Instructions.h:749
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:769
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:773
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
Value * getPointerOperand()
Definition: Instructions.h:886
BinOp getOperation() const
Definition: Instructions.h:819
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:877
Value * getValOperand()
Definition: Instructions.h:890
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:863
This is an SDNode representing atomic operations.
LLVM_ABI bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:400
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool any() const
any - Returns true if any bit is set.
Definition: BitVector.h:170
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:899
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
Value * getCalledOperand() const
Definition: InstrTypes.h:1340
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
Definition: CmpPredicate.h:23
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1314
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:3005
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1602
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value.
Definition: Constants.cpp:403
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if possible, or null if not.
Definition: Constants.cpp:435
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
unsigned size() const
Definition: DenseMap.h:120
bool empty() const
Definition: DenseMap.h:119
iterator begin()
Definition: DenseMap.h:78
iterator end()
Definition: DenseMap.h:87
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
Tagged union holding either a T or a Error.
Definition: Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal types or non-trivial lowering, but runs quickly.
Definition: FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lowering a region of the function.
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:128
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function. The enum values for the known calling conventions are defined in CallingConv.h.
Definition: Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1036
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1', drop it.
Definition: GlobalValue.h:569
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:424
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:273
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
This class is used to form a handle around another node that is persistent and is updated across invocations of replaceAllUsesWith on its operand.
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2780
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:123
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other instructions.
Definition: Instruction.h:171
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:82
Class to represent integer types.
Definition: DerivedTypes.h:42
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:74
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:180
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:245
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:253
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:258
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created and uniqued by the MCContext class.
Definition: MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:317
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor blocks to refer to this basic block.
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
unsigned succ_size() const
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes and use/def chains for registers.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:352
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:720
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:372
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:995
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
Definition: SelectionDAG.h:941
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
Definition: SelectionDAG.h:963
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
Definition: SelectionDAG.h:956
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI std::optional< uint64_t > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:459
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:506
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:808
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:885
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVM_ABI std::optional< uint64_t > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:777
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
Definition: SelectionDAG.h:918
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:979
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
void reserve(size_type NewNumEntries)
Definition: SmallPtrSet.h:117
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
size_type size() const
Definition: SmallSet.h:171
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
iterator erase(const_iterator CI)
Definition: SmallVector.h:738
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:579
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:287
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:581
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:269
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:151
size_t size_type
Definition: StringRef.h:61
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:281
static constexpr size_t npos
Definition: StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:180
LLVM_ABI size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C or npos if not found.
Definition: StringRef.cpp:252
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:719
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition: Triple.h:771
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, DriverKit, XROS, or bridgeOS).
Definition: Triple.h:608
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
LLVM_ABI uint64_t getArrayNumElements() const
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:35
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
use_iterator use_begin()
Definition: Value.h:364
bool use_empty() const
Definition: Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1098
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:232
bool hasAnyFMA() const
Definition: X86Subtarget.h:199
bool isOSWindows() const
Definition: X86Subtarget.h:325
bool isTargetMachO() const
Definition: X86Subtarget.h:292
bool isUEFI() const
Definition: X86Subtarget.h:323
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:217
bool hasSSE1() const
Definition: X86Subtarget.h:189
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
Definition: X86Subtarget.h:280
bool hasBitScanPassThrough() const
Definition: X86Subtarget.h:265
bool isPICStyleGOT() const
Definition: X86Subtarget.h:333
bool hasSSE42() const
Definition: X86Subtarget.h:194
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:118
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:277
bool canUseCMOV() const
Definition: X86Subtarget.h:188
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:336
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:301
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:181
bool isTargetDarwin() const
Definition: X86Subtarget.h:284
bool isTargetWin64() const
Definition: X86Subtarget.h:329
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:176
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:282
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:122
bool useAVX512Regs() const
Definition: X86Subtarget.h:249
bool hasSSE3() const
Definition: X86Subtarget.h:191
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:342
bool hasAVX512() const
Definition: X86Subtarget.h:197
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:228
bool hasSSE41() const
Definition: X86Subtarget.h:193
bool isTargetELF() const
Definition: X86Subtarget.h:290
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:205
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:182
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:190
bool hasSSSE3() const
Definition: X86Subtarget.h:192
bool hasInt256() const
Definition: X86Subtarget.h:198
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:334
bool isTargetCygMing() const
Definition: X86Subtarget.h:321
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:288
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:132
bool hasAVX() const
Definition: X86Subtarget.h:195
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:313
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:221
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:317
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:124
bool useBWIRegs() const
Definition: X86Subtarget.h:258
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:196
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y —> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return preferred fold type, Abs if this is a vector, AddAnd if its an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the targets addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition: APInt.cpp:3009
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attemps to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:256
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:45
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1401
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1379
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1265
@ ConstantFP
Definition: ISDOpcodes.h:87
@ STRICT_FATAN2
Definition: ISDOpcodes.h:441
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1381
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ STRICT_FCEIL
Definition: ISDOpcodes.h:454
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1382
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:140
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1112
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:515
@ STRICT_FTANH
Definition: ISDOpcodes.h:444
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
Definition: ISDOpcodes.h:1098
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:1020
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:167
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1364
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1338
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1343
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:505
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ STRICT_FLOG2
Definition: ISDOpcodes.h:449
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1309
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1568
@ GlobalTLSAddress
Definition: ISDOpcodes.h:89
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1212
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:151
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ STRICT_FASIN
Definition: ISDOpcodes.h:438
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:117
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1157
@ STRICT_FATAN
Definition: ISDOpcodes.h:440
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:809
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1380
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1432
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:663
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1261
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:458
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:952
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:988
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:987
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:463
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:452
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:928
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:453
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ STRICT_FSINH
Definition: ISDOpcodes.h:442
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1383
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:130
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1325
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ ConstantPool
Definition: ISDOpcodes.h:92
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ STRICT_FROUND
Definition: ISDOpcodes.h:456
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:477
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1413
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:996
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:455
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:457
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:994
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:110
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1117
@ STRICT_FCOSH
Definition: ISDOpcodes.h:443
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:997
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:420
@ STRICT_FLOG10
Definition: ISDOpcodes.h:448
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ STRICT_FEXP2
Definition: ISDOpcodes.h:446
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ ExternalSymbol
Definition: ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:690
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:903
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1256
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ BlockAddress
Definition: ISDOpcodes.h:94
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1433
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ STRICT_FRINT
Definition: ISDOpcodes.h:450
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1122
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:713
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1315
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ STRICT_FACOS
Definition: ISDOpcodes.h:439
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1761
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1756
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1572
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1743
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1718
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1685
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1665
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1724
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:980
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:58
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:217
@ FS
Definition: X86.h:214
@ PTR64
Definition: X86.h:218
@ PTR32_SPTR
Definition: X86.h:216
@ GS
Definition: X86.h:213
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:411
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:391
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:488
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:450
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:432
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:456
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:438
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:476
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:403
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:363
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:472
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:460
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:425
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:480
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:444
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:419
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:387
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x floatvector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true of the given offset can be fit into displacement field of the instruction.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double e
Definition: MathExtras.h:47
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:121
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:477
@ Length
Definition: DWP.cpp:477
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:139
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1605
@ SM_SentinelUndef
@ SM_SentinelZero
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:355
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:270
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition: STLExtras.h:2095
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1587
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:295
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1796
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:157
unsigned M1(unsigned Val)
Definition: VE.h:377
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:203
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2013
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1879
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from an machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:376
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
const char * toString(DWARFSectionKind Kind)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:2127
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ TRUNCATE2_TO_REG
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1629
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
#define EQ(a, b)
Definition: regexec.c:112
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:320
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:299
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:294
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
Definition: KnownBits.cpp:764
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:487
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:179
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:101
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:235
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition: KnownBits.h:267
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:154
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:282
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:165
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition: KnownBits.h:104
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:228
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:218
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:289
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:304
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:340
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:189
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:241
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:138
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:98
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:803
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:525
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:60
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to the original IR value.
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.