1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
23#include "llvm/ADT/APFloat.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/Statistic.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
82#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency will become the
143// bottleneck after this transform on high-end CPUs. So this max-leaf-node
144// limitation is a guard to ensure that cmp+ccmp remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157/// Value type used for condition codes.
158static const MVT MVT_CC = MVT::i32;
159
160static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161 AArch64::X3, AArch64::X4, AArch64::X5,
162 AArch64::X6, AArch64::X7};
163static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164 AArch64::Q3, AArch64::Q4, AArch64::Q5,
165 AArch64::Q6, AArch64::Q7};
166
168
170
171static inline EVT getPackedSVEVectorVT(EVT VT) {
172 switch (VT.getSimpleVT().SimpleTy) {
173 default:
174 llvm_unreachable("unexpected element type for vector");
175 case MVT::i8:
176 return MVT::nxv16i8;
177 case MVT::i16:
178 return MVT::nxv8i16;
179 case MVT::i32:
180 return MVT::nxv4i32;
181 case MVT::i64:
182 return MVT::nxv2i64;
183 case MVT::f16:
184 return MVT::nxv8f16;
185 case MVT::f32:
186 return MVT::nxv4f32;
187 case MVT::f64:
188 return MVT::nxv2f64;
189 case MVT::bf16:
190 return MVT::nxv8bf16;
191 }
192}
193
194// NOTE: Currently there's only a need to return integer vector types. If this
195// changes then just add an extra "type" parameter.
197 switch (EC.getKnownMinValue()) {
198 default:
199 llvm_unreachable("unexpected element count for vector");
200 case 16:
201 return MVT::nxv16i8;
202 case 8:
203 return MVT::nxv8i16;
204 case 4:
205 return MVT::nxv4i32;
206 case 2:
207 return MVT::nxv2i64;
208 }
209}
210
212 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213 "Expected scalable predicate vector type!");
214 switch (VT.getVectorMinNumElements()) {
215 default:
216 llvm_unreachable("unexpected element count for vector");
217 case 2:
218 return MVT::nxv2i64;
219 case 4:
220 return MVT::nxv4i32;
221 case 8:
222 return MVT::nxv8i16;
223 case 16:
224 return MVT::nxv16i8;
225 }
226}
227
228/// Returns true if VT's elements occupy the lowest bit positions of its
229/// associated register class without any intervening space.
230///
231/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232/// same register class, but only nxv8f16 can be treated as a packed vector.
233static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
235 "Expected legal vector type!");
236 return VT.isFixedLengthVector() ||
238}
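// Editor's illustration (sketch, not part of the original source): with the
// check above, nxv8f16 (8 x 16 bits = one full 128-bit SVE granule per vscale)
// is packed, whereas nxv4f16 only occupies the low half of each granule and is
// not; any legal fixed-length vector trivially counts as packed.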
239
240// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241// predicate and end with a passthru value matching the result type.
242static bool isMergePassthruOpcode(unsigned Opc) {
243 switch (Opc) {
244 default:
245 return false;
276 return true;
277 }
278}
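// Editor's illustration (sketch, not part of the original source): a unary
// node in this family has the form (FNEG_MERGE_PASSTHRU Pg, Src, Passthru),
// i.e. the governing predicate comes first and the passthru value, merged into
// the inactive lanes, comes last.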
279
280// Returns true if inactive lanes are known to be zeroed by construction.
282 switch (Op.getOpcode()) {
283 default:
284 return false;
285 // We guarantee i1 splat_vectors to zero the other lanes
289 return true;
291 switch (Op.getConstantOperandVal(0)) {
292 default:
293 return false;
294 case Intrinsic::aarch64_sve_ptrue:
295 case Intrinsic::aarch64_sve_pnext:
296 case Intrinsic::aarch64_sve_cmpeq:
297 case Intrinsic::aarch64_sve_cmpne:
298 case Intrinsic::aarch64_sve_cmpge:
299 case Intrinsic::aarch64_sve_cmpgt:
300 case Intrinsic::aarch64_sve_cmphs:
301 case Intrinsic::aarch64_sve_cmphi:
302 case Intrinsic::aarch64_sve_cmpeq_wide:
303 case Intrinsic::aarch64_sve_cmpne_wide:
304 case Intrinsic::aarch64_sve_cmpge_wide:
305 case Intrinsic::aarch64_sve_cmpgt_wide:
306 case Intrinsic::aarch64_sve_cmplt_wide:
307 case Intrinsic::aarch64_sve_cmple_wide:
308 case Intrinsic::aarch64_sve_cmphs_wide:
309 case Intrinsic::aarch64_sve_cmphi_wide:
310 case Intrinsic::aarch64_sve_cmplo_wide:
311 case Intrinsic::aarch64_sve_cmpls_wide:
312 case Intrinsic::aarch64_sve_fcmpeq:
313 case Intrinsic::aarch64_sve_fcmpne:
314 case Intrinsic::aarch64_sve_fcmpge:
315 case Intrinsic::aarch64_sve_fcmpgt:
316 case Intrinsic::aarch64_sve_fcmpuo:
317 case Intrinsic::aarch64_sve_facgt:
318 case Intrinsic::aarch64_sve_facge:
319 case Intrinsic::aarch64_sve_whilege:
320 case Intrinsic::aarch64_sve_whilegt:
321 case Intrinsic::aarch64_sve_whilehi:
322 case Intrinsic::aarch64_sve_whilehs:
323 case Intrinsic::aarch64_sve_whilele:
324 case Intrinsic::aarch64_sve_whilelo:
325 case Intrinsic::aarch64_sve_whilels:
326 case Intrinsic::aarch64_sve_whilelt:
327 case Intrinsic::aarch64_sve_match:
328 case Intrinsic::aarch64_sve_nmatch:
329 case Intrinsic::aarch64_sve_whilege_x2:
330 case Intrinsic::aarch64_sve_whilegt_x2:
331 case Intrinsic::aarch64_sve_whilehi_x2:
332 case Intrinsic::aarch64_sve_whilehs_x2:
333 case Intrinsic::aarch64_sve_whilele_x2:
334 case Intrinsic::aarch64_sve_whilelo_x2:
335 case Intrinsic::aarch64_sve_whilels_x2:
336 case Intrinsic::aarch64_sve_whilelt_x2:
337 return true;
338 }
339 }
340}
341
342static std::tuple<SDValue, SDValue>
344 SDLoc DL(Disc);
345 SDValue AddrDisc;
346 SDValue ConstDisc;
347
348 // If this is a blend, remember the constant and address discriminators.
349 // Otherwise, it's either a constant discriminator, or a non-blended
350 // address discriminator.
351 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353 AddrDisc = Disc->getOperand(1);
354 ConstDisc = Disc->getOperand(2);
355 } else {
356 ConstDisc = Disc;
357 }
358
359 // If the constant discriminator (either the blend RHS, or the entire
360 // discriminator value) isn't a 16-bit constant, bail out, and let the
361 // discriminator be computed separately.
362 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365
366 // If there's no address discriminator, use NoRegister, which we'll later
367 // replace with XZR, or directly use a Z variant of the inst. when available.
368 if (!AddrDisc)
369 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370
371 return std::make_tuple(
372 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373 AddrDisc);
374}
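// Editor's worked example (sketch, not part of the original source): for
//   Disc = llvm.ptrauth.blend(%addr, 1234)
// this returns (TargetConstant 1234, %addr); a plain constant 42 yields
// (TargetConstant 42, NoRegister); any other discriminator, e.g. a constant
// that does not fit in 16 bits, yields (TargetConstant 0, Disc) so it is
// computed separately.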
375
377 const AArch64Subtarget &STI)
378 : TargetLowering(TM), Subtarget(&STI) {
379 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380 // we have to make something up. Arbitrarily, choose ZeroOrOne.
382 // When comparing vectors the result sets the different elements in the
383 // vector to all-one or all-zero.
385
386 // Set up the register classes.
387 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389
390 if (Subtarget->hasLS64()) {
391 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
394 }
395
396 if (Subtarget->hasFPARMv8()) {
397 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402 }
403
404 if (Subtarget->hasNEON()) {
405 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407
408 addDRType(MVT::v2f32);
409 addDRType(MVT::v8i8);
410 addDRType(MVT::v4i16);
411 addDRType(MVT::v2i32);
412 addDRType(MVT::v1i64);
413 addDRType(MVT::v1f64);
414 addDRType(MVT::v4f16);
415 addDRType(MVT::v4bf16);
416
417 addQRType(MVT::v4f32);
418 addQRType(MVT::v2f64);
419 addQRType(MVT::v16i8);
420 addQRType(MVT::v8i16);
421 addQRType(MVT::v4i32);
422 addQRType(MVT::v2i64);
423 addQRType(MVT::v8f16);
424 addQRType(MVT::v8bf16);
425 }
426
427 if (Subtarget->isSVEorStreamingSVEAvailable()) {
428 // Add legal sve predicate types
429 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434
435 // Add legal sve data types
436 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440
441 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447
448 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451
452 if (Subtarget->useSVEForFixedLengthVectors()) {
455 addRegisterClass(VT, &AArch64::ZPRRegClass);
456
459 addRegisterClass(VT, &AArch64::ZPRRegClass);
460 }
461 }
462
463 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467
468 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470 }
471
472 // Compute derived properties from the register classes
474
475 // Provide all sorts of operation actions
515
517
521
525
527
528 // Custom lowering hooks are needed for XOR
529 // to fold it into CSINC/CSINV.
532
535
536 // Virtually no operation on f128 is legal, but LLVM can't expand them when
537 // there's a valid register class, so we need custom operations in most cases.
562 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
563 // aren't handled.
564
565 // Lowering for many of the conversions is actually specified by the non-f128
566 // type. The LowerXXX function will be trivial when f128 isn't involved.
591 if (Subtarget->hasFPARMv8()) {
594 }
597 if (Subtarget->hasFPARMv8()) {
600 }
603
608
609 // Variable arguments.
614
615 // Variable-sized objects.
618
619 // Lowering Funnel Shifts to EXTR
624
626
627 // Constant pool entries
629
630 // BlockAddress
632
633 // AArch64 lacks both left-rotate and popcount instructions.
639 }
640
641 // AArch64 doesn't have i32 MULH{S|U}.
644
645 // AArch64 doesn't have {U|S}MUL_LOHI.
650
651 if (Subtarget->hasCSSC()) {
655
657
661
664
669
674 } else {
678
681
684 }
685
691 }
698
699 // Custom lower Add/Sub/Mul with overflow.
712
721
730 if (Subtarget->hasFullFP16()) {
733 } else {
736 }
737
738 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
751 setOperationAction(Op, MVT::f16, Promote);
752 setOperationAction(Op, MVT::v4f16, Expand);
753 setOperationAction(Op, MVT::v8f16, Expand);
754 setOperationAction(Op, MVT::bf16, Promote);
755 setOperationAction(Op, MVT::v4bf16, Expand);
756 setOperationAction(Op, MVT::v8bf16, Expand);
757 }
758
759 // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
766
767 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
768 for (auto Op : {
772 ISD::FADD,
773 ISD::FSUB,
774 ISD::FMUL,
775 ISD::FDIV,
776 ISD::FMA,
807 })
808 setOperationAction(Op, ScalarVT, Promote);
809
810 for (auto Op : {ISD::FNEG, ISD::FABS})
811 setOperationAction(Op, ScalarVT, Legal);
812
813 // Round-to-integer operations need custom lowering for fp16, as Promote
814 // doesn't work because the result type is integer.
818 setOperationAction(Op, ScalarVT, Custom);
819
820 // promote v4f16 to v4f32 when that is known to be safe.
821 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
822 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
823 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
824 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
825 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
826 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
827 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
828 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
829 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
830 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
831 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
832 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
833 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
834
844
845 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
867 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
868 };
869
870 if (!Subtarget->hasFullFP16()) {
871 LegalizeNarrowFP(MVT::f16);
872 }
873 LegalizeNarrowFP(MVT::bf16);
876
877 // AArch64 has implementations of a lot of rounding-like FP operations.
878 // clang-format off
879 for (auto Op :
891 for (MVT Ty : {MVT::f32, MVT::f64})
893 if (Subtarget->hasFullFP16())
894 setOperationAction(Op, MVT::f16, Legal);
895 }
896 // clang-format on
897
898 // Basic strict FP operations are legal
901 for (MVT Ty : {MVT::f32, MVT::f64})
903 if (Subtarget->hasFullFP16())
904 setOperationAction(Op, MVT::f16, Legal);
905 }
906
908
914
916 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
919 } else {
922 }
925
926 // Generate outline atomics library calls only if LSE was not specified for
927 // the subtarget.
928 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
954#define LCALLNAMES(A, B, N) \
955 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
956 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
957 setLibcallName(A##N##_REL, #B #N "_rel"); \
958 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
959#define LCALLNAME4(A, B) \
960 LCALLNAMES(A, B, 1) \
961 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
962#define LCALLNAME5(A, B) \
963 LCALLNAMES(A, B, 1) \
964 LCALLNAMES(A, B, 2) \
965 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
966 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
967 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
968 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
969 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
970 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
971 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
972#undef LCALLNAMES
973#undef LCALLNAME4
974#undef LCALLNAME5
975 }
976
977 if (Subtarget->hasLSE128()) {
978 // Custom lowering because i128 is not legal. Must be replaced by 2x64
979 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
983 }
984
985 // 128-bit loads and stores can be done without expanding
988
989 // Aligned 128-bit loads and stores are single-copy atomic according to the
990 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
991 if (Subtarget->hasLSE2()) {
994 }
995
996 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
997 // custom lowering, as there are no un-paired non-temporal stores and
998 // legalization will break up 256 bit inputs.
1000 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1001 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1002 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1003 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1004 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1005 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1006 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1007
1008 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1009 // custom lowering, as there are no un-paired non-temporal loads and legalization
1010 // will break up 256 bit inputs.
1011 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1012 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1013 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1014 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1015 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1016 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1017 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1018 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1019
1020 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1022
1023 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1024 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1025 // Issue __sincos_stret if available.
1028 } else {
1031 }
1032
1033 // Make floating-point constants legal for the large code model, so they don't
1034 // become loads from the constant pool.
1035 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1038 }
1039
1040 // AArch64 does not have floating-point extending loads, i1 sign-extending
1041 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1042 for (MVT VT : MVT::fp_valuetypes()) {
1043 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1044 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1045 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1046 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1047 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1048 }
1049 for (MVT VT : MVT::integer_valuetypes())
1050 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1051
1052 for (MVT WideVT : MVT::fp_valuetypes()) {
1053 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1054 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1055 setTruncStoreAction(WideVT, NarrowVT, Expand);
1056 }
1057 }
1058 }
1059
1060 if (Subtarget->hasFPARMv8()) {
1064 }
1065
1066 // Indexed loads and stores are supported.
1067 for (unsigned im = (unsigned)ISD::PRE_INC;
1069 setIndexedLoadAction(im, MVT::i8, Legal);
1070 setIndexedLoadAction(im, MVT::i16, Legal);
1071 setIndexedLoadAction(im, MVT::i32, Legal);
1072 setIndexedLoadAction(im, MVT::i64, Legal);
1073 setIndexedLoadAction(im, MVT::f64, Legal);
1074 setIndexedLoadAction(im, MVT::f32, Legal);
1075 setIndexedLoadAction(im, MVT::f16, Legal);
1076 setIndexedLoadAction(im, MVT::bf16, Legal);
1077 setIndexedStoreAction(im, MVT::i8, Legal);
1078 setIndexedStoreAction(im, MVT::i16, Legal);
1079 setIndexedStoreAction(im, MVT::i32, Legal);
1080 setIndexedStoreAction(im, MVT::i64, Legal);
1081 setIndexedStoreAction(im, MVT::f64, Legal);
1082 setIndexedStoreAction(im, MVT::f32, Legal);
1083 setIndexedStoreAction(im, MVT::f16, Legal);
1084 setIndexedStoreAction(im, MVT::bf16, Legal);
1085 }
1086
1087 // Trap.
1088 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1091
1092 // We combine OR nodes for bitfield operations.
1094 // Try to create BICs for vector ANDs.
1096
1097 // llvm.init.trampoline and llvm.adjust.trampoline
1100
1101 // Vector add and sub nodes may conceal a high-half opportunity.
1102 // Also, try to fold ADD into CSINC/CSINV.
1105
1108
1109 // Try and combine setcc with csel
1111
1113
1120
1122
1124
1126
1130
1133
1135
1137
1139
1143
1145
1147
1148 // In case of strict alignment, avoid an excessive number of byte wide stores.
1151 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1152
1156 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1157
1160 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1161
1164 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1165
1167
1169
1170 EnableExtLdPromotion = true;
1171
1172 // Set required alignment.
1174 // Set preferred alignments.
1175
1176 // Don't align loops on Windows. The SEH unwind info generation needs to
1177 // know the exact length of functions before the alignments have been
1178 // expanded.
1179 if (!Subtarget->isTargetWindows())
1183
1184 // Only change the limit for entries in a jump table if specified by
1185 // the subtarget, but not at the command line.
1186 unsigned MaxJT = STI.getMaximumJumpTableSize();
1187 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1189
1191
1193
1195 if (Subtarget->hasSME())
1197
1198 if (Subtarget->isNeonAvailable()) {
1199 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1200 // silliness like this:
1201 // clang-format off
1202 for (auto Op :
1223 setOperationAction(Op, MVT::v1f64, Expand);
1224 // clang-format on
1225
1226 for (auto Op :
1231 setOperationAction(Op, MVT::v1i64, Expand);
1232
1233 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1234 // elements smaller than i32, so promote the input to i32 first.
1235 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1236 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1237
1238 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1239 // Nor a direct i32 -> f16 vector conversion. Set it to Custom, so the
1240 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1243 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1245
1246 if (Subtarget->hasFullFP16()) {
1249
1258 } else {
1259 // when AArch64 doesn't have fullfp16 support, promote the input
1260 // to i32 first.
1261 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1262 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1263 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1264 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1265 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1266 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1267 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1268 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1269 }
1270
1271 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1272 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1279 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1284 }
1285
1286 // Custom handling for some quad-vector types to detect MULL.
1287 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1288 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1289 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1290 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1291 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1292 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1293
1294 // Saturates
1295 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1296 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1301 }
1302
1303 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1304 MVT::v4i32}) {
1311 }
1312
1313 // Vector reductions
1314 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1315 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1316 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1321
1323 }
1324 }
1325 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1326 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1335 }
1340
1342 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1343 // Likewise, narrowing and extending vector loads/stores aren't handled
1344 // directly.
1347
1348 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1351 } else {
1354 }
1357
1360
1361 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1362 setTruncStoreAction(VT, InnerVT, Expand);
1363 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1364 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1365 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1366 }
1367 }
1368
1369 for (auto Op :
1375 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1377 if (Subtarget->hasFullFP16())
1378 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1380 }
1381
1382 // LRINT and LLRINT.
1383 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1384 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1386 if (Subtarget->hasFullFP16())
1387 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1389 }
1390
1391 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1392
1397
1401
1402 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1403 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1404 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1405 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1406 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1407 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1408
1409 // ADDP custom lowering
1410 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1412 // FADDP custom lowering
1413 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1415 } else /* !isNeonAvailable */ {
1417 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1419
1420 if (VT.is128BitVector() || VT.is64BitVector()) {
1424 Subtarget->isLittleEndian() ? Legal : Expand);
1425 }
1426 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1427 setTruncStoreAction(VT, InnerVT, Expand);
1428 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1429 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1430 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1431 }
1432 }
1433 }
1434
1435 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1439 }
1440
1441 if (Subtarget->hasSME()) {
1443 }
1444
1445 // FIXME: Move lowering for more nodes here if those are common between
1446 // SVE and SME.
1447 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1448 for (auto VT :
1449 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1454 }
1455 }
1456
1457 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1458 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1499
1505
1514
1519
1520 if (!Subtarget->isLittleEndian())
1522
1523 if (Subtarget->hasSVE2() ||
1524 (Subtarget->hasSME() && Subtarget->isStreaming()))
1525 // For SLI/SRI.
1527 }
1528
1529 // Illegal unpacked integer vector types.
1530 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1533 }
1534
1535 // Type legalize unpacked bitcasts.
1536 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1538
1539 for (auto VT :
1540 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1541 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1543
1544 for (auto VT :
1545 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1553
1557
1558 // There are no legal MVT::nxv16f## based types.
1559 if (VT != MVT::nxv16i1) {
1562 }
1563 }
1564
1565 // NEON doesn't support masked loads/stores, but SME and SVE do.
1566 for (auto VT :
1567 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1568 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1569 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1572 }
1573
1574 // Firstly, exclude all scalable vector extending loads/truncating stores,
1575 // covering both integer and floating-point scalable vectors.
1577 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1578 setTruncStoreAction(VT, InnerVT, Expand);
1579 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1580 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1581 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1582 }
1583 }
1584
1585 // Then, selectively enable those which we directly support.
1586 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1587 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1588 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1589 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1590 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1591 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1592 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1593 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1594 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1595 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1596 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1597 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1598 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1599 }
1600
1601 // SVE supports truncating stores of 64 and 128-bit vectors
1602 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1603 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1604 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1605 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1606 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1607
1608 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1609 MVT::nxv4f32, MVT::nxv2f64}) {
1649
1671
1683 }
1684
1685 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1696
1697 if (Subtarget->hasSVEB16B16()) {
1706 }
1707 }
1708
1709 for (auto Opcode :
1712 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1713 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1714 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1715 }
1716
1717 if (!Subtarget->hasSVEB16B16()) {
1718 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1720 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1721 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1722 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1723 }
1724 }
1725
1728
1729 // NEON doesn't support integer divides, but SVE does
1730 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1731 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1734 }
1735
1736 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1737 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1738 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1739
1740 // NOTE: Currently this has to happen after computeRegisterProperties rather
1741 // than the preferred option of combining it with the addRegisterClass call.
1742 if (Subtarget->useSVEForFixedLengthVectors()) {
1745 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1746 addTypeForFixedLengthSVE(VT);
1747 }
1750 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1751 addTypeForFixedLengthSVE(VT);
1752 }
1753
1754 // 64-bit results can mean a bigger-than-NEON input.
1755 for (auto VT : {MVT::v8i8, MVT::v4i16})
1758
1759 // 128-bit results imply a bigger-than-NEON input.
1760 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1762 for (auto VT : {MVT::v8f16, MVT::v4f32})
1764
1765 // These operations are not supported on NEON but SVE can do them.
1767 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1768 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1769 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1770 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1771 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1772 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1773 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1774 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1775 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1776 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1777 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1778 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1779 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1780 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1781 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1786
1787 // Int operations with no NEON support.
1788 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1789 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1797 }
1798
1799 // Use SVE for vectors with more than 2 elements.
1800 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1802 }
1803
1804 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1805 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1806 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1807 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1808
1810
1811 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1813 }
1814
1815 // Handle operations that are only available in non-streaming SVE mode.
1816 if (Subtarget->isSVEAvailable()) {
1817 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1818 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1819 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1820 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1821 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1822 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1823 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1826 }
1827
1828 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1829 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1830 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1832
1833 // We can lower types that have <vscale x {2|4}> elements to compact.
1834 for (auto VT :
1835 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1836 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1838
1839 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1840 // NEON vectors in the lowest bits of the SVE register.
1841 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1842 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1844
1845 // Histcnt is SVE2 only
1846 if (Subtarget->hasSVE2()) {
1848 Custom);
1850 Custom);
1851 }
1852 }
1853
1854
1855 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1856 // Only required for llvm.aarch64.mops.memset.tag
1858 }
1859
1861
1862 if (Subtarget->hasSVE()) {
1867 }
1868
1869 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1870
1871 IsStrictFPEnabled = true;
1873
1874 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1875 // it, but it's just a wrapper around ldexp.
1876 if (Subtarget->isTargetWindows()) {
1878 if (isOperationExpand(Op, MVT::f32))
1879 setOperationAction(Op, MVT::f32, Promote);
1880 }
1881
1882 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1883 // isn't legal.
1885 if (isOperationExpand(Op, MVT::f16))
1886 setOperationAction(Op, MVT::f16, Promote);
1887
1888 if (Subtarget->isWindowsArm64EC()) {
1889 // FIXME: are there intrinsics we need to exclude from this?
1890 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1891 auto code = static_cast<RTLIB::Libcall>(i);
1892 auto libcallName = getLibcallName(code);
1893 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1894 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1895 }
1896 }
1897 }
1898}
1899
1900void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1901 assert(VT.isVector() && "VT should be a vector type");
1902
1903 if (VT.isFloatingPoint()) {
1905 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1906 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1907 }
1908
1909 // Mark vector float intrinsics as expand.
1910 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1928 }
1929
1930 // But we do support custom-lowering for FCOPYSIGN.
1931 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1932 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1933 VT == MVT::v8f16) &&
1934 Subtarget->hasFullFP16()))
1936
1949
1953 for (MVT InnerVT : MVT::all_valuetypes())
1954 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1955
1956 // CNT supports only B element sizes, so use UADDLP to widen afterwards.
1957 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1959
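// Editor's note (illustrative sketch, not part of the original source): e.g. a
// v4i16 CTPOP becomes a CNT over the bytes of the vector followed by a UADDLP
// that pairwise-accumulates the byte counts back into halfword lanes.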
1965
1966 for (unsigned Opcode :
1969 setOperationAction(Opcode, VT, Custom);
1970
1971 if (!VT.isFloatingPoint())
1973
1974 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1975 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1976 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1977 setOperationAction(Opcode, VT, Legal);
1978
1979 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1980 // NEON types.
1981 if (VT.isFloatingPoint() &&
1982 VT.getVectorElementType() != MVT::bf16 &&
1983 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1984 for (unsigned Opcode :
1990 setOperationAction(Opcode, VT, Legal);
1991
1992 // Strict fp extend and trunc are legal
1993 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1995 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1997
1998 // FIXME: We could potentially make use of the vector comparison instructions
1999 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2000 // complications:
2001 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2002 // so we would need to expand when the condition code doesn't match the
2003 // kind of comparison.
2004 // * Some kinds of comparison require more than one FCMXY instruction so
2005 // would need to be expanded instead.
2006 // * The lowering of the non-strict versions involves target-specific ISD
2007 // nodes so we would likely need to add strict versions of all of them and
2008 // handle them appropriately.
2011
2012 if (Subtarget->isLittleEndian()) {
2013 for (unsigned im = (unsigned)ISD::PRE_INC;
2017 }
2018 }
2019
2020 if (Subtarget->hasD128()) {
2023 }
2024}
2025
2027 EVT OpVT) const {
2028 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2029 if (!Subtarget->hasSVE())
2030 return true;
2031
2032 // We can only support legal predicate result types. We can use the SVE
2033 // whilelo instruction for generating fixed-width predicates too.
2034 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
2035 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
2036 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
2037 return true;
2038
2039 // The whilelo instruction only works with i32 or i64 scalar inputs.
2040 if (OpVT != MVT::i32 && OpVT != MVT::i64)
2041 return true;
2042
2043 return false;
2044}
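// Editor's illustration (sketch, not part of the original source): a
// get.active.lane.mask with an nxv4i1 result and i64 operands maps directly
// onto WHILELO and is not expanded, while an nxv1i1 result or i16 operands
// fall back to generic expansion.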
2045
2047 const IntrinsicInst *I) const {
2048 if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
2049 return true;
2050
2051 EVT VT = EVT::getEVT(I->getType());
2052 auto Op1 = I->getOperand(1);
2053 EVT Op1VT = EVT::getEVT(Op1->getType());
2054 if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
2055 (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
2056 VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
2057 return false;
2058 return true;
2059}
2060
2062 if (!Subtarget->isSVEorStreamingSVEAvailable())
2063 return true;
2064
2065 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2066 // also support fixed-width predicates.
2067 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2068 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2069 VT != MVT::v4i1 && VT != MVT::v2i1;
2070}
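// Editor's note (sketch, not part of the original source): e.g. the number of
// inactive lanes before the first active lane of an nxv16i1 mask can be found
// with BRKB, which forms a predicate of exactly those lanes, followed by CNTP
// to count them.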
2071
2073 unsigned SearchSize) const {
2074 // MATCH is SVE2 and only available in non-streaming mode.
2075 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2076 return true;
2077 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2078 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2079 return SearchSize != 8;
2080 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2081 return SearchSize != 8 && SearchSize != 16;
2082 return true;
2083}
2084
2085void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2086 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2087
2088 // By default everything must be expanded.
2089 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2091
2092 if (VT.isFloatingPoint()) {
2102 }
2103
2105 VT == MVT::v1f64 ? Expand : Custom;
2106
2107 // Mark integer truncating stores/extending loads as having custom lowering
2108 if (VT.isInteger()) {
2109 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2110 while (InnerVT != VT) {
2111 setTruncStoreAction(VT, InnerVT, Default);
2112 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2113 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2114 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2115 InnerVT = InnerVT.changeVectorElementType(
2116 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2117 }
2118 }
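// Editor's illustration (sketch, not part of the original source): for
// VT == MVT::v8i32 the loop above registers truncating stores to v8i8 and
// v8i16, plus the matching extending loads, with the Default action (Custom
// here, since VT is not v1f64).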
2119
2120 // Mark floating-point truncating stores/extending loads as having custom
2121 // lowering
2122 if (VT.isFloatingPoint()) {
2123 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2124 while (InnerVT != VT) {
2125 setTruncStoreAction(VT, InnerVT, Custom);
2126 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2127 InnerVT = InnerVT.changeVectorElementType(
2129 }
2130 }
2131
2132 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2133 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2134
2135 // Lower fixed length vector operations to scalable equivalents.
2142 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2179 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2180 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2182 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2201 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2227}
2228
2229void AArch64TargetLowering::addDRType(MVT VT) {
2230 addRegisterClass(VT, &AArch64::FPR64RegClass);
2231 if (Subtarget->isNeonAvailable())
2232 addTypeForNEON(VT);
2233}
2234
2235void AArch64TargetLowering::addQRType(MVT VT) {
2236 addRegisterClass(VT, &AArch64::FPR128RegClass);
2237 if (Subtarget->isNeonAvailable())
2238 addTypeForNEON(VT);
2239}
2240
2242 LLVMContext &C, EVT VT) const {
2243 if (!VT.isVector())
2244 return MVT::i32;
2245 if (VT.isScalableVector())
2246 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2248}
2249
2250// isIntImmediate - This method tests to see if the node is a constant
2251// operand. If so, Imm will receive the value.
2252static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2253 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2254 Imm = C->getZExtValue();
2255 return true;
2256 }
2257 return false;
2258}
2259
2260// isOpcWithIntImmediate - This method tests to see if the node is a specific
2261// opcode and that it has an immediate integer right operand.
2262// If so, Imm will receive the value.
2263static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2264 uint64_t &Imm) {
2265 return N->getOpcode() == Opc &&
2266 isIntImmediate(N->getOperand(1).getNode(), Imm);
2267}
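// Editor's usage sketch (not part of the original source):
//   uint64_t Imm;
//   if (isOpcWithIntImmediate(N, ISD::AND, Imm))
//     ...; // N is (and X, C) and Imm now holds the value of C.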
2268
2269static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2270 const APInt &Demanded,
2272 unsigned NewOpc) {
2273 uint64_t OldImm = Imm, NewImm, Enc;
2274 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2275
2276 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2277 // bimm64.
2278 if (Imm == 0 || Imm == Mask ||
2280 return false;
2281
2282 unsigned EltSize = Size;
2283 uint64_t DemandedBits = Demanded.getZExtValue();
2284
2285 // Clear bits that are not demanded.
2286 Imm &= DemandedBits;
2287
2288 while (true) {
2289 // The goal here is to set the non-demanded bits in a way that minimizes
2290 // the number of switching between 0 and 1. In order to achieve this goal,
2291 // we set the non-demanded bits to the value of the preceding demanded bits.
2292 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2293 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2294 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2295 // The final result is 0b11000011.
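// Editor's worked example (treating the element as 8 bits; not part of the
// original source): for that value, DemandedBits = 0b01100101 and
// Imm & DemandedBits = 0b01000001, so
//   NonDemandedBits = 0b10011010
//   InvertedImm     = 0b00100100  (demanded bits that are zero)
//   RotatedImm      = 0b00001000  (the rotated-in top bit is zero here)
//   Sum             = 0b10100010  (RotatedImm + NonDemandedBits, no carry out)
//   Ones            = 0b10000010
//   NewImm          = 0b11000011  ((Imm | Ones) & Mask)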
2296 uint64_t NonDemandedBits = ~DemandedBits;
2297 uint64_t InvertedImm = ~Imm & DemandedBits;
2298 uint64_t RotatedImm =
2299 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2300 NonDemandedBits;
2301 uint64_t Sum = RotatedImm + NonDemandedBits;
2302 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2303 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2304 NewImm = (Imm | Ones) & Mask;
2305
2306 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2307 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2308 // we halve the element size and continue the search.
2309 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2310 break;
2311
2312 // We cannot shrink the element size any further if it is 2 bits.
2313 if (EltSize == 2)
2314 return false;
2315
2316 EltSize /= 2;
2317 Mask >>= EltSize;
2318 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2319
2320 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2321 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2322 return false;
2323
2324 // Merge the upper and lower halves of Imm and DemandedBits.
2325 Imm |= Hi;
2326 DemandedBits |= DemandedBitsHi;
2327 }
2328
2329 ++NumOptimizedImms;
2330
2331 // Replicate the element across the register width.
2332 while (EltSize < Size) {
2333 NewImm |= NewImm << EltSize;
2334 EltSize *= 2;
2335 }
2336
2337 (void)OldImm;
2338 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2339 "demanded bits should never be altered");
2340 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2341
2342 // Create the new constant immediate node.
2343 EVT VT = Op.getValueType();
2344 SDLoc DL(Op);
2345 SDValue New;
2346
2347 // If the new constant immediate is all-zeros or all-ones, let the target
2348 // independent DAG combine optimize this node.
2349 if (NewImm == 0 || NewImm == OrigMask) {
2350 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2351 TLO.DAG.getConstant(NewImm, DL, VT));
2352 // Otherwise, create a machine node so that target independent DAG combine
2353 // doesn't undo this optimization.
2354 } else {
2356 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2357 New = SDValue(
2358 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2359 }
2360
2361 return TLO.CombineTo(Op, New);
2362}
2363
2365 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2366 TargetLoweringOpt &TLO) const {
2367 // Delay this optimization to as late as possible.
2368 if (!TLO.LegalOps)
2369 return false;
2370
2372 return false;
2373
2374 EVT VT = Op.getValueType();
2375 if (VT.isVector())
2376 return false;
2377
2378 unsigned Size = VT.getSizeInBits();
2379
2380 if (Size != 32 && Size != 64)
2381 return false;
2382
2383 // Exit early if we demand all bits.
2384 if (DemandedBits.popcount() == Size)
2385 return false;
2386
2387 unsigned NewOpc;
2388 switch (Op.getOpcode()) {
2389 default:
2390 return false;
2391 case ISD::AND:
2392 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2393 break;
2394 case ISD::OR:
2395 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2396 break;
2397 case ISD::XOR:
2398 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2399 break;
2400 }
2401 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2402 if (!C)
2403 return false;
2404 uint64_t Imm = C->getZExtValue();
2405 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2406}
2407
2408/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2409/// Mask are known to be either zero or one and return them in Known.
2411 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2412 const SelectionDAG &DAG, unsigned Depth) const {
2413 switch (Op.getOpcode()) {
2414 default:
2415 break;
2416 case AArch64ISD::DUP: {
2417 SDValue SrcOp = Op.getOperand(0);
2418 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2419 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2420 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2421 "Expected DUP implicit truncation");
2422 Known = Known.trunc(Op.getScalarValueSizeInBits());
2423 }
2424 break;
2425 }
2426 case AArch64ISD::CSEL: {
2427 KnownBits Known2;
2428 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2429 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2430 Known = Known.intersectWith(Known2);
2431 break;
2432 }
2433 case AArch64ISD::BICi: {
2434 // Compute the bit cleared value.
2435 APInt Mask =
2436 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2437 .trunc(Known.getBitWidth());
2438 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2439 Known &= KnownBits::makeConstant(Mask);
2440 break;
2441 }
2442 case AArch64ISD::VLSHR: {
2443 KnownBits Known2;
2444 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2445 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2446 Known = KnownBits::lshr(Known, Known2);
2447 break;
2448 }
2449 case AArch64ISD::VASHR: {
2450 KnownBits Known2;
2451 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2452 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2453 Known = KnownBits::ashr(Known, Known2);
2454 break;
2455 }
2456 case AArch64ISD::VSHL: {
2457 KnownBits Known2;
2458 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2459 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2460 Known = KnownBits::shl(Known, Known2);
2461 break;
2462 }
2463 case AArch64ISD::MOVI: {
2465 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2466 break;
2467 }
2469 case AArch64ISD::ADDlow: {
2470 if (!Subtarget->isTargetILP32())
2471 break;
2472 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2473 Known.Zero = APInt::getHighBitsSet(64, 32);
2474 break;
2475 }
2477 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2478 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2479 break;
2480 }
2482 Intrinsic::ID IntID =
2483 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2484 switch (IntID) {
2485 default: return;
2486 case Intrinsic::aarch64_ldaxr:
2487 case Intrinsic::aarch64_ldxr: {
2488 unsigned BitWidth = Known.getBitWidth();
2489 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2490 unsigned MemBits = VT.getScalarSizeInBits();
2491 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2492 return;
2493 }
2494 }
2495 break;
2496 }
2498 case ISD::INTRINSIC_VOID: {
2499 unsigned IntNo = Op.getConstantOperandVal(0);
2500 switch (IntNo) {
2501 default:
2502 break;
2503 case Intrinsic::aarch64_neon_uaddlv: {
2504 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2505 unsigned BitWidth = Known.getBitWidth();
2506 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2507 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2508 assert(BitWidth >= Bound && "Unexpected width!");
2510 Known.Zero |= Mask;
2511 }
2512 break;
2513 }
2514 case Intrinsic::aarch64_neon_umaxv:
2515 case Intrinsic::aarch64_neon_uminv: {
2516 // Figure out the datatype of the vector operand. The UMINV instruction
2517 // will zero extend the result, so we can mark as known zero all the
2518 // bits larger than the element datatype. 32-bit or larger doesn't need
2519 // this as those are legal types and will be handled by isel directly.
2520 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2521 unsigned BitWidth = Known.getBitWidth();
2522 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2523 assert(BitWidth >= 8 && "Unexpected width!");
2525 Known.Zero |= Mask;
2526 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2527 assert(BitWidth >= 16 && "Unexpected width!");
2529 Known.Zero |= Mask;
2530 }
2531 break;
2532 } break;
2533 }
2534 }
2535 }
2536}
2537
2539 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2540 unsigned Depth) const {
2541 EVT VT = Op.getValueType();
2542 unsigned VTBits = VT.getScalarSizeInBits();
2543 unsigned Opcode = Op.getOpcode();
2544 switch (Opcode) {
2545 case AArch64ISD::CMEQ:
2546 case AArch64ISD::CMGE:
2547 case AArch64ISD::CMGT:
2548 case AArch64ISD::CMHI:
2549 case AArch64ISD::CMHS:
2550 case AArch64ISD::FCMEQ:
2551 case AArch64ISD::FCMGE:
2552 case AArch64ISD::FCMGT:
2553 case AArch64ISD::CMEQz:
2554 case AArch64ISD::CMGEz:
2555 case AArch64ISD::CMGTz:
2556 case AArch64ISD::CMLEz:
2557 case AArch64ISD::CMLTz:
2558 case AArch64ISD::FCMEQz:
2559 case AArch64ISD::FCMGEz:
2560 case AArch64ISD::FCMGTz:
2561 case AArch64ISD::FCMLEz:
2562 case AArch64ISD::FCMLTz:
2563 // Compares return either 0 or all-ones
2564 return VTBits;
2565 case AArch64ISD::VASHR: {
2566 unsigned Tmp =
2567 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2568 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2569 }
2570 }
2571
2572 return 1;
2573}
2574
2576 EVT) const {
2577 return MVT::i64;
2578}
2579
2581 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2582 unsigned *Fast) const {
2583
2584 // Allow SVE loads/stores where the alignment >= the size of the element type,
2585 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2586 // for stores that come from IR, only require element-size alignment (even if
2587 // unaligned accesses are disabled). Without this, these will be forced to
2588 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2589 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2590 if (VT.isScalableVector()) {
2591 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2592 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2593 return true;
2594 }
2595
2596 if (Subtarget->requiresStrictAlign())
2597 return false;
2598
2599 if (Fast) {
2600 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2601 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2602 // See comments in performSTORECombine() for more details about
2603 // these conditions.
2604
2605 // Code that uses clang vector extensions can mark that it
2606 // wants unaligned accesses to be treated as fast by
2607 // underspecifying alignment to be 1 or 2.
2608 Alignment <= 2 ||
2609
2610 // Disregard v2i64. Memcpy lowering produces those and splitting
2611 // them regresses performance on micro-benchmarks and olden/bh.
2612 VT == MVT::v2i64;
2613 }
2614 return true;
2615}
2616
2617// Same as above but handling LLTs instead.
2618 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2619 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2620 unsigned *Fast) const {
2621 if (Subtarget->requiresStrictAlign())
2622 return false;
2623
2624 if (Fast) {
2625 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2626 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2627 Ty.getSizeInBytes() != 16 ||
2628 // See comments in performSTORECombine() for more details about
2629 // these conditions.
2630
2631 // Code that uses clang vector extensions can mark that it
2632 // wants unaligned accesses to be treated as fast by
2633 // underspecifying alignment to be 1 or 2.
2634 Alignment <= 2 ||
2635
2636 // Disregard v2i64. Memcpy lowering produces those and splitting
2637 // them regresses performance on micro-benchmarks and olden/bh.
2638 Ty == LLT::fixed_vector(2, 64);
2639 }
2640 return true;
2641}
2642
2643FastISel *
2644 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2645 const TargetLibraryInfo *libInfo) const {
2646 return AArch64::createFastISel(funcInfo, libInfo);
2647}
2648
2649const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2650#define MAKE_CASE(V) \
2651 case V: \
2652 return #V;
2653 switch ((AArch64ISD::NodeType)Opcode) {
2654 case AArch64ISD::FIRST_NUMBER:
2655 break;
2984 }
2985#undef MAKE_CASE
2986 return nullptr;
2987}
2988
2989 MachineBasicBlock *
2990 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2991 MachineBasicBlock *MBB) const {
2992 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2993 // phi node:
2994
2995 // OrigBB:
2996 // [... previous instrs leading to comparison ...]
2997 // b.ne TrueBB
2998 // b EndBB
2999 // TrueBB:
3000 // ; Fallthrough
3001 // EndBB:
3002 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
3003
3004 MachineFunction *MF = MBB->getParent();
3005 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3006 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3007 DebugLoc DL = MI.getDebugLoc();
3008 MachineFunction::iterator It = ++MBB->getIterator();
3009
3010 Register DestReg = MI.getOperand(0).getReg();
3011 Register IfTrueReg = MI.getOperand(1).getReg();
3012 Register IfFalseReg = MI.getOperand(2).getReg();
3013 unsigned CondCode = MI.getOperand(3).getImm();
3014 bool NZCVKilled = MI.getOperand(4).isKill();
3015
3016 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3017 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3018 MF->insert(It, TrueBB);
3019 MF->insert(It, EndBB);
3020
3021 // Transfer rest of current basic-block to EndBB
3022 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3023 MBB->end());
3024 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
3025
3026 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3027 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3028 MBB->addSuccessor(TrueBB);
3029 MBB->addSuccessor(EndBB);
3030
3031 // TrueBB falls through to the end.
3032 TrueBB->addSuccessor(EndBB);
3033
3034 if (!NZCVKilled) {
3035 TrueBB->addLiveIn(AArch64::NZCV);
3036 EndBB->addLiveIn(AArch64::NZCV);
3037 }
3038
3039 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3040 .addReg(IfTrueReg)
3041 .addMBB(TrueBB)
3042 .addReg(IfFalseReg)
3043 .addMBB(MBB);
3044
3045 MI.eraseFromParent();
3046 return EndBB;
3047}
3048
3049 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
3050 MachineInstr &MI, MachineBasicBlock *BB) const {
3051 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
3052 BB->getParent()->getFunction().getPersonalityFn())) &&
3053 "SEH does not use catchret!");
3054 return BB;
3055}
3056
3057 MachineBasicBlock *
3058 AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
3059 MachineBasicBlock *MBB) const {
3060 MachineFunction &MF = *MBB->getParent();
3061 MachineBasicBlock::iterator MBBI = MI.getIterator();
3063 const AArch64InstrInfo &TII =
3064 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3065 Register TargetReg = MI.getOperand(0).getReg();
3066 MachineBasicBlock::iterator NextInst =
3067 TII.probedStackAlloc(MBBI, TargetReg, false);
3068
3069 MI.eraseFromParent();
3070 return NextInst->getParent();
3071}
3072
3073 MachineBasicBlock *
3074 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3075 MachineInstr &MI,
3076 MachineBasicBlock *BB) const {
3077 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3078 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3079
3080 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3081 MIB.add(MI.getOperand(1)); // slice index register
3082 MIB.add(MI.getOperand(2)); // slice index offset
3083 MIB.add(MI.getOperand(3)); // pg
3084 MIB.add(MI.getOperand(4)); // base
3085 MIB.add(MI.getOperand(5)); // offset
3086
3087 MI.eraseFromParent(); // The pseudo is gone now.
3088 return BB;
3089}
3090
3091 MachineBasicBlock *AArch64TargetLowering::EmitFill(MachineInstr &MI,
3092 MachineBasicBlock *BB) const {
3093 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3094 MachineInstrBuilder MIB =
3095 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3096
3097 MIB.addReg(AArch64::ZA, RegState::Define);
3098 MIB.add(MI.getOperand(0)); // Vector select register
3099 MIB.add(MI.getOperand(1)); // Vector select offset
3100 MIB.add(MI.getOperand(2)); // Base
3101 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3102
3103 MI.eraseFromParent(); // The pseudo is gone now.
3104 return BB;
3105}
3106
3107 MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
3108 MachineBasicBlock *BB,
3109 unsigned Opcode,
3110 bool Op0IsDef) const {
3111 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3112 MachineInstrBuilder MIB;
3113
3114 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3115 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3116 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3117 MIB.add(MI.getOperand(I));
3118
3119 MI.eraseFromParent(); // The pseudo is gone now.
3120 return BB;
3121}
3122
3123 MachineBasicBlock *
3124 AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3125 MachineInstr &MI,
3126 MachineBasicBlock *BB) const {
3127 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3128 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3129 unsigned StartIdx = 0;
3130
3131 bool HasTile = BaseReg != AArch64::ZA;
3132 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3133 if (HasZPROut) {
3134 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3135 ++StartIdx;
3136 }
3137 if (HasTile) {
3138 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3139 RegState::Define); // Output ZA Tile
3140 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3141 StartIdx++;
3142 } else {
3143 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3144 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3145 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3146 ++StartIdx;
3147 }
3148 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3149 }
3150 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3151 MIB.add(MI.getOperand(I));
3152
3153 MI.eraseFromParent(); // The pseudo is gone now.
3154 return BB;
3155}
3156
3157 MachineBasicBlock *AArch64TargetLowering::EmitZero(MachineInstr &MI,
3158 MachineBasicBlock *BB) const {
3159 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3160 MachineInstrBuilder MIB =
3161 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3162 MIB.add(MI.getOperand(0)); // Mask
3163
3164 unsigned Mask = MI.getOperand(0).getImm();
3165 for (unsigned I = 0; I < 8; I++) {
3166 if (Mask & (1 << I))
3167 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3168 }
3169
3170 MI.eraseFromParent(); // The pseudo is gone now.
3171 return BB;
3172}
3173
3174 MachineBasicBlock *
3175 AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3176 MachineBasicBlock *BB) const {
3177 MachineFunction *MF = BB->getParent();
3178 MachineFrameInfo &MFI = MF->getFrameInfo();
3179 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3180 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3181 if (TPIDR2.Uses > 0) {
3182 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3183 // Store the buffer pointer to the TPIDR2 stack object.
3184 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3185 .addReg(MI.getOperand(0).getReg())
3186 .addFrameIndex(TPIDR2.FrameIndex)
3187 .addImm(0);
3188 // Set the reserved bytes (10-15) to zero
3189 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3190 .addReg(AArch64::WZR)
3191 .addFrameIndex(TPIDR2.FrameIndex)
3192 .addImm(5);
3193 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3194 .addReg(AArch64::WZR)
3195 .addFrameIndex(TPIDR2.FrameIndex)
3196 .addImm(3);
3197 } else
3198 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3199
3200 BB->remove_instr(&MI);
3201 return BB;
3202}
3203
3204 MachineBasicBlock *
3205 AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3206 MachineBasicBlock *BB) const {
3207 MachineFunction *MF = BB->getParent();
3208 MachineFrameInfo &MFI = MF->getFrameInfo();
3209 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3210 // TODO This function grows the stack with a subtraction, which doesn't work
3211 // on Windows. Some refactoring to share the functionality in
3212 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3213 // supports SME
3215 "Lazy ZA save is not yet supported on Windows");
3216
3217 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3218
3219 if (TPIDR2.Uses > 0) {
3220 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3221 MachineRegisterInfo &MRI = MF->getRegInfo();
3222
3223 // The SUBXrs below won't always be emitted in a form that accepts SP
3224 // directly
3225 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3226 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3227 .addReg(AArch64::SP);
3228
3229 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3230 auto Size = MI.getOperand(1).getReg();
3231 auto Dest = MI.getOperand(0).getReg();
3232 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3233 .addReg(Size)
3234 .addReg(Size)
3235 .addReg(SP);
3236 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3237 AArch64::SP)
3238 .addReg(Dest);
3239
3240 // We have just allocated a variable sized object, tell this to PEI.
3241 MFI.CreateVariableSizedObject(Align(16), nullptr);
3242 }
3243
3244 BB->remove_instr(&MI);
3245 return BB;
3246}
3247
3248// TODO: Find a way to merge this with EmitAllocateZABuffer.
3249 MachineBasicBlock *
3250 AArch64TargetLowering::EmitAllocateSMESaveBuffer(MachineInstr &MI,
3251 MachineBasicBlock *BB) const {
3252 MachineFunction *MF = BB->getParent();
3253 MachineFrameInfo &MFI = MF->getFrameInfo();
3256 "Lazy ZA save is not yet supported on Windows");
3257
3258 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3259 if (FuncInfo->isSMESaveBufferUsed()) {
3260 // Allocate a buffer object of the size given by MI.getOperand(1).
3261 auto Size = MI.getOperand(1).getReg();
3262 auto Dest = MI.getOperand(0).getReg();
3263 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3264 .addReg(AArch64::SP)
3265 .addReg(Size)
3266 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
3267 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3268 .addReg(AArch64::SP);
3269
3270 // We have just allocated a variable sized object, tell this to PEI.
3271 MFI.CreateVariableSizedObject(Align(16), nullptr);
3272 } else
3273 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3274 MI.getOperand(0).getReg());
3275
3276 BB->remove_instr(&MI);
3277 return BB;
3278}
3279
3280 MachineBasicBlock *
3281 AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
3282 MachineBasicBlock *BB) const {
3283 // If the buffer is used, emit a call to __arm_sme_state_size()
3284 MachineFunction *MF = BB->getParent();
3285 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3286 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3287 if (FuncInfo->isSMESaveBufferUsed()) {
3288 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3289 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3290 .addExternalSymbol("__arm_sme_state_size")
3291 .addReg(AArch64::X0, RegState::ImplicitDefine)
3292 .addRegMask(TRI->getCallPreservedMask(
3293 *MF, CallingConv::
3295 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3296 MI.getOperand(0).getReg())
3297 .addReg(AArch64::X0);
3298 } else
3299 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3300 MI.getOperand(0).getReg())
3301 .addReg(AArch64::XZR);
3302 BB->remove_instr(&MI);
3303 return BB;
3304}
3305
3306 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3307 MachineInstr &MI, MachineBasicBlock *BB) const {
3308
3309 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3310 if (SMEOrigInstr != -1) {
3311 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3312 uint64_t SMEMatrixType =
3313 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3314 switch (SMEMatrixType) {
3315 case (AArch64::SMEMatrixArray):
3316 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3317 case (AArch64::SMEMatrixTileB):
3318 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3319 case (AArch64::SMEMatrixTileH):
3320 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3321 case (AArch64::SMEMatrixTileS):
3322 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3323 case (AArch64::SMEMatrixTileD):
3324 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3325 case (AArch64::SMEMatrixTileQ):
3326 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3327 }
3328 }
3329
3330 switch (MI.getOpcode()) {
3331 default:
3332#ifndef NDEBUG
3333 MI.dump();
3334#endif
3335 llvm_unreachable("Unexpected instruction for custom inserter!");
3336 case AArch64::InitTPIDR2Obj:
3337 return EmitInitTPIDR2Object(MI, BB);
3338 case AArch64::AllocateZABuffer:
3339 return EmitAllocateZABuffer(MI, BB);
3340 case AArch64::AllocateSMESaveBuffer:
3341 return EmitAllocateSMESaveBuffer(MI, BB);
3342 case AArch64::GetSMESaveSize:
3343 return EmitGetSMESaveSize(MI, BB);
3344 case AArch64::F128CSEL:
3345 return EmitF128CSEL(MI, BB);
3346 case TargetOpcode::STATEPOINT:
3347 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3348 // while bl call instruction (where statepoint will be lowered at the end)
3349 // has implicit def. This def is early-clobber as it will be set at
3350 // the moment of the call and earlier than any use is read.
3351 // Add this implicit dead def here as a workaround.
3352 MI.addOperand(*MI.getMF(),
3353 MachineOperand::CreateReg(
3354 AArch64::LR, /*isDef*/ true,
3355 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3356 /*isUndef*/ false, /*isEarlyClobber*/ true));
3357 [[fallthrough]];
3358 case TargetOpcode::STACKMAP:
3359 case TargetOpcode::PATCHPOINT:
3360 return emitPatchPoint(MI, BB);
3361
3362 case TargetOpcode::PATCHABLE_EVENT_CALL:
3363 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3364 return BB;
3365
3366 case AArch64::CATCHRET:
3367 return EmitLoweredCatchRet(MI, BB);
3368
3369 case AArch64::PROBED_STACKALLOC_DYN:
3370 return EmitDynamicProbedAlloc(MI, BB);
3371
3372 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3373 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3374 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3375 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3376 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3377 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3378 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3379 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3380 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3381 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3382 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3383 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3384 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3385 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3386 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3387 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3388 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3389 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3390 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3391 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3392 case AArch64::LDR_ZA_PSEUDO:
3393 return EmitFill(MI, BB);
3394 case AArch64::LDR_TX_PSEUDO:
3395 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3396 case AArch64::STR_TX_PSEUDO:
3397 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3398 case AArch64::ZERO_M_PSEUDO:
3399 return EmitZero(MI, BB);
3400 case AArch64::ZERO_T_PSEUDO:
3401 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3402 case AArch64::MOVT_TIZ_PSEUDO:
3403 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3404 }
3405}
3406
3407//===----------------------------------------------------------------------===//
3408// AArch64 Lowering private implementation.
3409//===----------------------------------------------------------------------===//
3410
3411//===----------------------------------------------------------------------===//
3412// Lowering Code
3413//===----------------------------------------------------------------------===//
3414
3415// Forward declarations of SVE fixed length lowering helpers
3420 SelectionDAG &DAG);
3423 EVT VT);
3424
3425/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3426static bool isZerosVector(const SDNode *N) {
3427 // Look through a bit convert.
3428 while (N->getOpcode() == ISD::BITCAST)
3429 N = N->getOperand(0).getNode();
3430
3431 if (ISD::isConstantSplatVectorAllZeros(N))
3432 return true;
3433
3434 if (N->getOpcode() != AArch64ISD::DUP)
3435 return false;
3436
3437 auto Opnd0 = N->getOperand(0);
3438 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3439}
3440
3441/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3442/// CC
3443 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3444 switch (CC) {
3445 default:
3446 llvm_unreachable("Unknown condition code!");
3447 case ISD::SETNE:
3448 return AArch64CC::NE;
3449 case ISD::SETEQ:
3450 return AArch64CC::EQ;
3451 case ISD::SETGT:
3452 return AArch64CC::GT;
3453 case ISD::SETGE:
3454 return AArch64CC::GE;
3455 case ISD::SETLT:
3456 return AArch64CC::LT;
3457 case ISD::SETLE:
3458 return AArch64CC::LE;
3459 case ISD::SETUGT:
3460 return AArch64CC::HI;
3461 case ISD::SETUGE:
3462 return AArch64CC::HS;
3463 case ISD::SETULT:
3464 return AArch64CC::LO;
3465 case ISD::SETULE:
3466 return AArch64CC::LS;
3467 }
3468}
3469
3470/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3471 static void changeFPCCToAArch64CC(ISD::CondCode CC,
3472 AArch64CC::CondCode &CondCode,
3473 AArch64CC::CondCode &CondCode2) {
3474 CondCode2 = AArch64CC::AL;
3475 switch (CC) {
3476 default:
3477 llvm_unreachable("Unknown FP condition!");
3478 case ISD::SETEQ:
3479 case ISD::SETOEQ:
3480 CondCode = AArch64CC::EQ;
3481 break;
3482 case ISD::SETGT:
3483 case ISD::SETOGT:
3484 CondCode = AArch64CC::GT;
3485 break;
3486 case ISD::SETGE:
3487 case ISD::SETOGE:
3488 CondCode = AArch64CC::GE;
3489 break;
3490 case ISD::SETOLT:
3491 CondCode = AArch64CC::MI;
3492 break;
3493 case ISD::SETOLE:
3494 CondCode = AArch64CC::LS;
3495 break;
3496 case ISD::SETONE:
3497 CondCode = AArch64CC::MI;
3498 CondCode2 = AArch64CC::GT;
3499 break;
3500 case ISD::SETO:
3501 CondCode = AArch64CC::VC;
3502 break;
3503 case ISD::SETUO:
3504 CondCode = AArch64CC::VS;
3505 break;
3506 case ISD::SETUEQ:
3507 CondCode = AArch64CC::EQ;
3508 CondCode2 = AArch64CC::VS;
3509 break;
3510 case ISD::SETUGT:
3511 CondCode = AArch64CC::HI;
3512 break;
3513 case ISD::SETUGE:
3514 CondCode = AArch64CC::PL;
3515 break;
3516 case ISD::SETLT:
3517 case ISD::SETULT:
3518 CondCode = AArch64CC::LT;
3519 break;
3520 case ISD::SETLE:
3521 case ISD::SETULE:
3522 CondCode = AArch64CC::LE;
3523 break;
3524 case ISD::SETNE:
3525 case ISD::SETUNE:
3526 CondCode = AArch64CC::NE;
3527 break;
3528 }
3529}
3530
3531/// Convert a DAG fp condition code to an AArch64 CC.
3532/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3533/// should be AND'ed instead of OR'ed.
3534 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3535 AArch64CC::CondCode &CondCode,
3536 AArch64CC::CondCode &CondCode2) {
3537 CondCode2 = AArch64CC::AL;
3538 switch (CC) {
3539 default:
3540 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3541 assert(CondCode2 == AArch64CC::AL);
3542 break;
3543 case ISD::SETONE:
3544 // (a one b)
3545 // == ((a olt b) || (a ogt b))
3546 // == ((a ord b) && (a une b))
3547 CondCode = AArch64CC::VC;
3548 CondCode2 = AArch64CC::NE;
3549 break;
3550 case ISD::SETUEQ:
3551 // (a ueq b)
3552 // == ((a uno b) || (a oeq b))
3553 // == ((a ule b) && (a uge b))
3554 CondCode = AArch64CC::PL;
3555 CondCode2 = AArch64CC::LE;
3556 break;
3557 }
3558}
3559
3560/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3561/// CC usable with the vector instructions. Fewer operations are available
3562/// without a real NZCV register, so we have to use less efficient combinations
3563/// to get the same effect.
3564 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3565 AArch64CC::CondCode &CondCode,
3566 AArch64CC::CondCode &CondCode2,
3567 bool &Invert) {
3568 Invert = false;
3569 switch (CC) {
3570 default:
3571 // Mostly the scalar mappings work fine.
3572 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3573 break;
3574 case ISD::SETUO:
3575 Invert = true;
3576 [[fallthrough]];
3577 case ISD::SETO:
3578 CondCode = AArch64CC::MI;
3579 CondCode2 = AArch64CC::GE;
3580 break;
3581 case ISD::SETUEQ:
3582 case ISD::SETULT:
3583 case ISD::SETULE:
3584 case ISD::SETUGT:
3585 case ISD::SETUGE:
3586 // All of the compare-mask comparisons are ordered, but we can switch
3587 // between the two by a double inversion. E.g. ULE == !OGT.
3588 Invert = true;
3589 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3590 CondCode, CondCode2);
3591 break;
3592 }
3593}
3594
3595 static bool isLegalArithImmed(uint64_t C) {
3596 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3597 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3598 LLVM_DEBUG(dbgs() << "Is imm " << C
3599 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3600 return IsLegal;
3601}
3602
3603static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
3604 KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
3605 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3606}
3607
3608 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3609// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3610// can be set differently by this operation. It comes down to whether
3611// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3612// everything is fine. If not then the optimization is wrong. Thus general
3613// comparisons are only valid if op2 != 0.
3614//
3615// So, finally, the only LLVM-native comparisons that don't mention C or V
3616// are the ones that aren't unsigned comparisons. They're the only ones we can
3617// safely use CMN for in the absence of information about op2.
3618 static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3619 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3620 (isIntEqualitySetCC(CC) ||
3621 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3622 (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
3623}
3624
3625 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3626 SelectionDAG &DAG, SDValue Chain,
3627 bool IsSignaling) {
3628 EVT VT = LHS.getValueType();
3629 assert(VT != MVT::f128);
3630
3631 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3632
3633 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3634 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3635 {Chain, LHS});
3636 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3637 {LHS.getValue(1), RHS});
3638 Chain = RHS.getValue(1);
3639 }
3640 unsigned Opcode =
3641 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3642 return DAG.getNode(Opcode, dl, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
3643}
3644
3645 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3646 const SDLoc &dl, SelectionDAG &DAG) {
3647 EVT VT = LHS.getValueType();
3648 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3649
3650 if (VT.isFloatingPoint()) {
3651 assert(VT != MVT::f128);
3652 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3653 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3654 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3655 }
3656 return DAG.getNode(AArch64ISD::FCMP, dl, MVT::i32, LHS, RHS);
3657 }
3658
3659 // The CMP instruction is just an alias for SUBS, and representing it as
3660 // SUBS means that it's possible to get CSE with subtract operations.
3661 // A later phase can perform the optimization of setting the destination
3662 // register to WZR/XZR if it ends up being unused.
3663 unsigned Opcode = AArch64ISD::SUBS;
3664
3665 if (isCMN(RHS, CC, DAG)) {
3666 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3667 Opcode = AArch64ISD::ADDS;
3668 RHS = RHS.getOperand(1);
3669 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3670 isIntEqualitySetCC(CC)) {
3671 // As we are looking for EQ/NE compares, the operands can be commuted; can
3672 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3673 Opcode = AArch64ISD::ADDS;
3674 LHS = LHS.getOperand(1);
3675 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3676 if (LHS.getOpcode() == ISD::AND) {
3677 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3678 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3679 // of the signed comparisons.
3680 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3681 DAG.getVTList(VT, MVT_CC),
3682 LHS.getOperand(0),
3683 LHS.getOperand(1));
3684 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3685 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3686 return ANDSNode.getValue(1);
3687 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3688 // Use result of ANDS
3689 return LHS.getValue(1);
3690 }
3691 }
3692
3693 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3694 .getValue(1);
3695}
3696
3697/// \defgroup AArch64CCMP CMP;CCMP matching
3698///
3699/// These functions deal with the formation of CMP;CCMP;... sequences.
3700/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3701/// a comparison. They set the NZCV flags to a predefined value if their
3702/// predicate is false. This allows to express arbitrary conjunctions, for
3703/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3704/// expressed as:
3705/// cmp A
3706/// ccmp B, inv(CB), CA
3707/// check for CB flags
3708///
3709/// This naturally lets us implement chains of AND operations with SETCC
3710/// operands. And we can even implement some other situations by transforming
3711/// them:
3712/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3713/// negating the flags used in a CCMP/FCCMP operations.
3714/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3715/// by negating the flags we test for afterwards. i.e.
3716/// NEG (CMP CCMP CCCMP ...) can be implemented.
3717/// - Note that we can only ever negate all previously processed results.
3718/// What we can not implement by flipping the flags to test is a negation
3719/// of two sub-trees (because the negation affects all sub-trees emitted so
3720/// far, so the 2nd sub-tree we emit would also affect the first).
3721/// With those tools we can implement some OR operations:
3722/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3723/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3724/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3725/// elimination rules from earlier to implement the whole thing as a
3726/// CCMP/FCCMP chain.
3727///
3728/// As complete example:
3729/// or (or (setCA (cmp A)) (setCB (cmp B)))
3730/// (and (setCC (cmp C)) (setCD (cmp D)))"
3731/// can be reassociated to:
3732/// or (and (setCC (cmp C)) setCD (cmp D))
3733 /// (or (setCA (cmp A)) (setCB (cmp B)))
3734/// can be transformed to:
3735/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3736/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3737/// which can be implemented as:
3738/// cmp C
3739/// ccmp D, inv(CD), CC
3740/// ccmp A, CA, inv(CD)
3741/// ccmp B, CB, inv(CA)
3742/// check for CB flags
3743///
3744/// A counterexample is "or (and A B) (and C D)" which translates to
3745/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3746/// can only implement 1 of the inner (not) operations, but not both!
3747/// @{
3748
3749/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3750 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3751 ISD::CondCode CC, SDValue CCOp,
3752 AArch64CC::CondCode Predicate,
3753 AArch64CC::CondCode OutCC,
3754 const SDLoc &DL, SelectionDAG &DAG) {
3755 unsigned Opcode = 0;
3756 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3757
3758 if (LHS.getValueType().isFloatingPoint()) {
3759 assert(LHS.getValueType() != MVT::f128);
3760 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3761 LHS.getValueType() == MVT::bf16) {
3762 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3763 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3764 }
3765 Opcode = AArch64ISD::FCCMP;
3766 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3767 APInt Imm = Const->getAPIntValue();
3768 if (Imm.isNegative() && Imm.sgt(-32)) {
3769 Opcode = AArch64ISD::CCMN;
3770 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3771 }
3772 } else if (isCMN(RHS, CC, DAG)) {
3773 Opcode = AArch64ISD::CCMN;
3774 RHS = RHS.getOperand(1);
3775 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3776 isIntEqualitySetCC(CC)) {
3777 // As we are looking for EQ/NE compares, the operands can be commuted; can
3778 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3779 Opcode = AArch64ISD::CCMN;
3780 LHS = LHS.getOperand(1);
3781 }
3782 if (Opcode == 0)
3783 Opcode = AArch64ISD::CCMP;
3784
3785 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3786 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3787 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3788 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3789 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3790}
3791
3792/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3793/// expressed as a conjunction. See \ref AArch64CCMP.
3794/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3795/// changing the conditions on the SETCC tests.
3796/// (this means we can call emitConjunctionRec() with
3797/// Negate==true on this sub-tree)
3798/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3799/// cannot do the negation naturally. We are required to
3800/// emit the subtree first in this case.
3801/// \param WillNegate Is true if are called when the result of this
3802/// subexpression must be negated. This happens when the
3803/// outer expression is an OR. We can use this fact to know
3804/// that we have a double negation (or (or ...) ...) that
3805/// can be implemented for free.
3806static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3807 bool &MustBeFirst, bool WillNegate,
3808 unsigned Depth = 0) {
3809 if (!Val.hasOneUse())
3810 return false;
3811 unsigned Opcode = Val->getOpcode();
3812 if (Opcode == ISD::SETCC) {
3813 if (Val->getOperand(0).getValueType() == MVT::f128)
3814 return false;
3815 CanNegate = true;
3816 MustBeFirst = false;
3817 return true;
3818 }
3819 // Protect against exponential runtime and stack overflow.
3820 if (Depth > 6)
3821 return false;
3822 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3823 bool IsOR = Opcode == ISD::OR;
3824 SDValue O0 = Val->getOperand(0);
3825 SDValue O1 = Val->getOperand(1);
3826 bool CanNegateL;
3827 bool MustBeFirstL;
3828 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3829 return false;
3830 bool CanNegateR;
3831 bool MustBeFirstR;
3832 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3833 return false;
3834
3835 if (MustBeFirstL && MustBeFirstR)
3836 return false;
3837
3838 if (IsOR) {
3839 // For an OR expression we need to be able to naturally negate at least
3840 // one side or we cannot do the transformation at all.
3841 if (!CanNegateL && !CanNegateR)
3842 return false;
3843 // If the result of the OR will be negated and we can naturally negate
3844 // the leaves, then this sub-tree as a whole negates naturally.
3845 CanNegate = WillNegate && CanNegateL && CanNegateR;
3846 // If we cannot naturally negate the whole sub-tree, then this must be
3847 // emitted first.
3848 MustBeFirst = !CanNegate;
3849 } else {
3850 assert(Opcode == ISD::AND && "Must be OR or AND");
3851 // We cannot naturally negate an AND operation.
3852 CanNegate = false;
3853 MustBeFirst = MustBeFirstL || MustBeFirstR;
3854 }
3855 return true;
3856 }
3857 return false;
3858}
3859
3860/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3861/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3862/// Tries to transform the given i1 producing node @p Val to a series compare
3863/// and conditional compare operations. @returns an NZCV flags producing node
3864/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3865/// transformation was not possible.
3866/// \p Negate is true if we want this sub-tree being negated just by changing
3867/// SETCC conditions.
3868 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3869 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3870 AArch64CC::CondCode Predicate) {
3871 // We're at a tree leaf, produce a conditional comparison operation.
3872 unsigned Opcode = Val->getOpcode();
3873 if (Opcode == ISD::SETCC) {
3874 SDValue LHS = Val->getOperand(0);
3875 SDValue RHS = Val->getOperand(1);
3876 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3877 bool isInteger = LHS.getValueType().isInteger();
3878 if (Negate)
3879 CC = getSetCCInverse(CC, LHS.getValueType());
3880 SDLoc DL(Val);
3881 // Determine OutCC and handle FP special case.
3882 if (isInteger) {
3883 OutCC = changeIntCCToAArch64CC(CC);
3884 } else {
3885 assert(LHS.getValueType().isFloatingPoint());
3886 AArch64CC::CondCode ExtraCC;
3887 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3888 // Some floating point conditions can't be tested with a single condition
3889 // code. Construct an additional comparison in this case.
3890 if (ExtraCC != AArch64CC::AL) {
3891 SDValue ExtraCmp;
3892 if (!CCOp.getNode())
3893 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3894 else
3895 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3896 ExtraCC, DL, DAG);
3897 CCOp = ExtraCmp;
3898 Predicate = ExtraCC;
3899 }
3900 }
3901
3902 // Produce a normal comparison if we are first in the chain
3903 if (!CCOp)
3904 return emitComparison(LHS, RHS, CC, DL, DAG);
3905 // Otherwise produce a ccmp.
3906 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3907 DAG);
3908 }
3909 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3910
3911 bool IsOR = Opcode == ISD::OR;
3912
3913 SDValue LHS = Val->getOperand(0);
3914 bool CanNegateL;
3915 bool MustBeFirstL;
3916 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3917 assert(ValidL && "Valid conjunction/disjunction tree");
3918 (void)ValidL;
3919
3920 SDValue RHS = Val->getOperand(1);
3921 bool CanNegateR;
3922 bool MustBeFirstR;
3923 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3924 assert(ValidR && "Valid conjunction/disjunction tree");
3925 (void)ValidR;
3926
3927 // Swap sub-tree that must come first to the right side.
3928 if (MustBeFirstL) {
3929 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3930 std::swap(LHS, RHS);
3931 std::swap(CanNegateL, CanNegateR);
3932 std::swap(MustBeFirstL, MustBeFirstR);
3933 }
3934
3935 bool NegateR;
3936 bool NegateAfterR;
3937 bool NegateL;
3938 bool NegateAfterAll;
3939 if (Opcode == ISD::OR) {
3940 // Swap the sub-tree that we can negate naturally to the left.
3941 if (!CanNegateL) {
3942 assert(CanNegateR && "at least one side must be negatable");
3943 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3944 assert(!Negate);
3945 std::swap(LHS, RHS);
3946 NegateR = false;
3947 NegateAfterR = true;
3948 } else {
3949 // Negate the left sub-tree if possible, otherwise negate the result.
3950 NegateR = CanNegateR;
3951 NegateAfterR = !CanNegateR;
3952 }
3953 NegateL = true;
3954 NegateAfterAll = !Negate;
3955 } else {
3956 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3957 assert(!Negate && "Valid conjunction/disjunction tree");
3958
3959 NegateL = false;
3960 NegateR = false;
3961 NegateAfterR = false;
3962 NegateAfterAll = false;
3963 }
3964
3965 // Emit sub-trees.
3966 AArch64CC::CondCode RHSCC;
3967 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3968 if (NegateAfterR)
3969 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3970 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3971 if (NegateAfterAll)
3972 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3973 return CmpL;
3974}
3975
3976/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3977/// In some cases this is even possible with OR operations in the expression.
3978/// See \ref AArch64CCMP.
3979/// \see emitConjunctionRec().
3980 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3981 AArch64CC::CondCode &OutCC) {
3982 bool DummyCanNegate;
3983 bool DummyMustBeFirst;
3984 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3985 return SDValue();
3986
3987 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3988}
3989
3990/// @}
3991
3992/// Returns how profitable it is to fold a comparison's operand's shift and/or
3993/// extension operations.
3994 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3995 auto isSupportedExtend = [&](SDValue V) {
3996 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3997 return true;
3998
3999 if (V.getOpcode() == ISD::AND)
4000 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4001 uint64_t Mask = MaskCst->getZExtValue();
4002 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4003 }
4004
4005 return false;
4006 };
4007
4008 if (!Op.hasOneUse())
4009 return 0;
4010
4011 if (isSupportedExtend(Op))
4012 return 1;
4013
4014 unsigned Opc = Op.getOpcode();
4015 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4016 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4017 uint64_t Shift = ShiftCst->getZExtValue();
4018 if (isSupportedExtend(Op.getOperand(0)))
4019 return (Shift <= 4) ? 2 : 1;
4020 EVT VT = Op.getValueType();
4021 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4022 return 1;
4023 }
4024
4025 return 0;
4026}
4027
4028 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4029 SDValue &AArch64cc, SelectionDAG &DAG,
4030 const SDLoc &dl) {
4031 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4032 EVT VT = RHS.getValueType();
4033 uint64_t C = RHSC->getZExtValue();
4034 if (!isLegalArithImmed(C)) {
4035 // Constant does not fit, try adjusting it by one?
4036 switch (CC) {
4037 default:
4038 break;
4039 case ISD::SETLT:
4040 case ISD::SETGE:
4041 if ((VT == MVT::i32 && C != 0x80000000 &&
4042 isLegalArithImmed((uint32_t)(C - 1))) ||
4043 (VT == MVT::i64 && C != 0x80000000ULL &&
4044 isLegalArithImmed(C - 1ULL))) {
4045 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4046 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4047 RHS = DAG.getConstant(C, dl, VT);
4048 }
4049 break;
4050 case ISD::SETULT:
4051 case ISD::SETUGE:
4052 if ((VT == MVT::i32 && C != 0 &&
4053 isLegalArithImmed((uint32_t)(C - 1))) ||
4054 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
4055 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4056 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4057 RHS = DAG.getConstant(C, dl, VT);
4058 }
4059 break;
4060 case ISD::SETLE:
4061 case ISD::SETGT:
4062 if ((VT == MVT::i32 && C != INT32_MAX &&
4063 isLegalArithImmed((uint32_t)(C + 1))) ||
4064 (VT == MVT::i64 && C != INT64_MAX &&
4065 isLegalArithImmed(C + 1ULL))) {
4066 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4067 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4068 RHS = DAG.getConstant(C, dl, VT);
4069 }
4070 break;
4071 case ISD::SETULE:
4072 case ISD::SETUGT:
4073 if ((VT == MVT::i32 && C != UINT32_MAX &&
4074 isLegalArithImmed((uint32_t)(C + 1))) ||
4075 (VT == MVT::i64 && C != UINT64_MAX &&
4076 isLegalArithImmed(C + 1ULL))) {
4077 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4078 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4079 RHS = DAG.getConstant(C, dl, VT);
4080 }
4081 break;
4082 }
4083 }
4084 }
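// Illustrative: for "x s< 0x1001" the constant 0x1001 is not a legal ADDS/SUBS
// immediate, but the adjustment above rewrites the test to "x s<= 0x1000", and
// 0x1000 is a legal 12-bit immediate shifted left by 12.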
4085
4086 // Comparisons are canonicalized so that the RHS operand is simpler than the
4087 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4088 // can fold some shift+extend operations on the RHS operand, so swap the
4089 // operands if that can be done.
4090 //
4091 // For example:
4092 // lsl w13, w11, #1
4093 // cmp w13, w12
4094 // can be turned into:
4095 // cmp w12, w11, lsl #1
4096 if (!isa<ConstantSDNode>(RHS) ||
4097 !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
4098 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4099 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4100 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4101 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4102
4103 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4104 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4105 std::swap(LHS, RHS);
4106 CC = ISD::getSetCCSwappedOperands(CC);
4107 }
4108 }
4109
4110 SDValue Cmp;
4111 AArch64CC::CondCode AArch64CC;
4112 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4113 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4114
4115 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4116 // For the i8 operand, the largest immediate is 255, so this can be easily
4117 // encoded in the compare instruction. For the i16 operand, however, the
4118 // largest immediate cannot be encoded in the compare.
4119 // Therefore, use a sign extending load and cmn to avoid materializing the
4120 // -1 constant. For example,
4121 // movz w1, #65535
4122 // ldrh w0, [x0, #0]
4123 // cmp w0, w1
4124 // >
4125 // ldrsh w0, [x0, #0]
4126 // cmn w0, #1
4127 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4128 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4129 // ensure both the LHS and RHS are truly zero extended and to make sure the
4130 // transformation is profitable.
4131 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4132 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4133 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4134 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4135 int16_t ValueofRHS = RHS->getAsZExtVal();
4136 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4137 SDValue SExt =
4138 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
4139 DAG.getValueType(MVT::i16));
4140 Cmp = emitComparison(
4141 SExt, DAG.getSignedConstant(ValueofRHS, dl, RHS.getValueType()), CC,
4142 dl, DAG);
4143 AArch64CC = changeIntCCToAArch64CC(CC);
4144 }
4145 }
4146
4147 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4148 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4149 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4150 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4151 }
4152 }
4153 }
4154
4155 if (!Cmp) {
4156 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4157 AArch64CC = changeIntCCToAArch64CC(CC);
4158 }
4159 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
4160 return Cmp;
4161}
4162
4163static std::pair<SDValue, SDValue>
4164 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4165 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4166 "Unsupported value type");
4167 SDValue Value, Overflow;
4168 SDLoc DL(Op);
4169 SDValue LHS = Op.getOperand(0);
4170 SDValue RHS = Op.getOperand(1);
4171 unsigned Opc = 0;
4172 switch (Op.getOpcode()) {
4173 default:
4174 llvm_unreachable("Unknown overflow instruction!");
4175 case ISD::SADDO:
4176 Opc = AArch64ISD::ADDS;
4177 CC = AArch64CC::VS;
4178 break;
4179 case ISD::UADDO:
4180 Opc = AArch64ISD::ADDS;
4181 CC = AArch64CC::HS;
4182 break;
4183 case ISD::SSUBO:
4184 Opc = AArch64ISD::SUBS;
4185 CC = AArch64CC::VS;
4186 break;
4187 case ISD::USUBO:
4188 Opc = AArch64ISD::SUBS;
4189 CC = AArch64CC::LO;
4190 break;
4191 // Multiply needs a little extra work.
4192 case ISD::SMULO:
4193 case ISD::UMULO: {
4194 CC = AArch64CC::NE;
4195 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4196 if (Op.getValueType() == MVT::i32) {
4197 // Extend to 64-bits, then perform a 64-bit multiply.
4198 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4199 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4200 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4201 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4202 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4203
4204 // Check that the result fits into a 32-bit integer.
4205 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
4206 if (IsSigned) {
4207 // cmp xreg, wreg, sxtw
4208 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4209 Overflow =
4210 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4211 } else {
4212 // tst xreg, #0xffffffff00000000
4213 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4214 Overflow =
4215 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4216 }
4217 break;
4218 }
4219 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4220 // For the 64 bit multiply
4221 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4222 if (IsSigned) {
4223 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4224 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4225 DAG.getConstant(63, DL, MVT::i64));
4226 // It is important that LowerBits is last, otherwise the arithmetic
4227 // shift will not be folded into the compare (SUBS).
4228 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4229 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4230 .getValue(1);
4231 } else {
4232 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4233 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4234 Overflow =
4235 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4236 DAG.getConstant(0, DL, MVT::i64),
4237 UpperBits).getValue(1);
4238 }
4239 break;
4240 }
4241 } // switch (...)
4242
4243 if (Opc) {
4244 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4245
4246 // Emit the AArch64 operation with overflow check.
4247 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4248 Overflow = Value.getValue(1);
4249 }
4250 return std::make_pair(Value, Overflow);
4251}
4252
4253SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4254 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4255 !Subtarget->isNeonAvailable()))
4256 return LowerToScalableOp(Op, DAG);
4257
4258 SDValue Sel = Op.getOperand(0);
4259 SDValue Other = Op.getOperand(1);
4260 SDLoc dl(Sel);
4261
4262 // If the operand is an overflow checking operation, invert the condition
4263 // code and kill the Not operation. I.e., transform:
4264 // (xor (overflow_op_bool, 1))
4265 // -->
4266 // (csel 1, 0, invert(cc), overflow_op_bool)
4267 // ... which later gets transformed to just a cset instruction with an
4268 // inverted condition code, rather than a cset + eor sequence.
4269 if (isOverflowIntrOpRes(Sel)) {
4270 // Only lower legal XALUO ops.
4271 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4272 return SDValue();
4273
4274 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4275 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4276 AArch64CC::CondCode CC;
4277 SDValue Value, Overflow;
4278 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4279 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4280 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4281 CCVal, Overflow);
4282 }
4283 // If neither operand is a SELECT_CC, give up.
4284 if (Sel.getOpcode() != ISD::SELECT_CC)
4285 std::swap(Sel, Other);
4286 if (Sel.getOpcode() != ISD::SELECT_CC)
4287 return Op;
4288
4289 // The folding we want to perform is:
4290 // (xor x, (select_cc a, b, cc, 0, -1) )
4291 // -->
4292 // (csel x, (xor x, -1), cc ...)
4293 //
4294 // The latter will get matched to a CSINV instruction.
4295
4296 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4297 SDValue LHS = Sel.getOperand(0);
4298 SDValue RHS = Sel.getOperand(1);
4299 SDValue TVal = Sel.getOperand(2);
4300 SDValue FVal = Sel.getOperand(3);
4301
4302 // FIXME: This could be generalized to non-integer comparisons.
4303 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4304 return Op;
4305
4306 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4307 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4308
4309 // The values aren't constants, this isn't the pattern we're looking for.
4310 if (!CFVal || !CTVal)
4311 return Op;
4312
4313 // We can commute the SELECT_CC by inverting the condition. This
4314 // might be needed to make this fit into a CSINV pattern.
4315 if (CTVal->isAllOnes() && CFVal->isZero()) {
4316 std::swap(TVal, FVal);
4317 std::swap(CTVal, CFVal);
4318 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4319 }
4320
4321 // If the constants line up, perform the transform!
4322 if (CTVal->isZero() && CFVal->isAllOnes()) {
4323 SDValue CCVal;
4324 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4325
4326 FVal = Other;
4327 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4328 DAG.getAllOnesConstant(dl, Other.getValueType()));
4329
4330 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4331 CCVal, Cmp);
4332 }
4333
4334 return Op;
4335}
4336
4337// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4338// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4339// sets 'C' bit to 0.
4340 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4341 SDLoc DL(Value);
4342 EVT VT = Value.getValueType();
4343 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4344 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4345 SDValue Cmp =
4346 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4347 return Cmp.getValue(1);
4348}
4349
4350// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4351// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4352 static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4353 bool Invert) {
4354 assert(Glue.getResNo() == 1);
4355 SDLoc DL(Glue);
4356 SDValue Zero = DAG.getConstant(0, DL, VT);
4357 SDValue One = DAG.getConstant(1, DL, VT);
4358 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4359 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4360 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4361}
4362
4363// Value is 1 if 'V' bit of NZCV is 1, else 0
4364 static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4365 assert(Glue.getResNo() == 1);
4366 SDLoc DL(Glue);
4367 SDValue Zero = DAG.getConstant(0, DL, VT);
4368 SDValue One = DAG.getConstant(1, DL, VT);
4369 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4370 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4371}
4372
4373// This lowering is inefficient, but it will get cleaned up by
4374// `foldOverflowCheck`
4375 static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4376 unsigned Opcode, bool IsSigned) {
4377 EVT VT0 = Op.getValue(0).getValueType();
4378 EVT VT1 = Op.getValue(1).getValueType();
4379
4380 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4381 return SDValue();
4382
4383 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4384 SDValue OpLHS = Op.getOperand(0);
4385 SDValue OpRHS = Op.getOperand(1);
4386 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4387
4388 SDLoc DL(Op);
4389 SDVTList VTs = DAG.getVTList(VT0, VT1);
4390
4391 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4392 OpRHS, OpCarryIn);
4393
4394 SDValue OutFlag =
4395 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4396 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4397
4398 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4399}
4400
4401 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4402 // Let legalize expand this if it isn't a legal type yet.
4403 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4404 return SDValue();
4405
4406 SDLoc dl(Op);
4407 AArch64CC::CondCode CC;
4408 // The actual operation that sets the overflow or carry flag.
4409 SDValue Value, Overflow;
4410 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4411
4412 // We use 0 and 1 as false and true values.
4413 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4414 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4415
4416 // We use an inverted condition, because the conditional select is inverted
4417 // too. This will allow it to be selected to a single instruction:
4418 // CSINC Wd, WZR, WZR, invert(cond).
4419 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4420 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4421 CCVal, Overflow);
4422
4423 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4424 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4425}
4426
4427// Prefetch operands are:
4428// 1: Address to prefetch
4429// 2: bool isWrite
4430// 3: int locality (0 = no locality ... 3 = extreme locality)
4431// 4: bool isDataCache
4432 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4433 SDLoc DL(Op);
4434 unsigned IsWrite = Op.getConstantOperandVal(2);
4435 unsigned Locality = Op.getConstantOperandVal(3);
4436 unsigned IsData = Op.getConstantOperandVal(4);
4437
4438 bool IsStream = !Locality;
4439 // When the locality number is set
4440 if (Locality) {
4441 // The front-end should have filtered out the out-of-range values
4442 assert(Locality <= 3 && "Prefetch locality out-of-range");
4443 // The locality degree is the opposite of the cache speed.
4444 // Put the number the other way around.
4445 // The encoding starts at 0 for level 1
4446 Locality = 3 - Locality;
4447 }
4448
4449 // Build the mask value encoding the expected behavior.
4450 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4451 (!IsData << 3) | // IsDataCache bit
4452 (Locality << 1) | // Cache level bits
4453 (unsigned)IsStream; // Stream bit
4454 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4455 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4456 Op.getOperand(1));
4457}
4458
4459// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1)) 0 EQ when Y is
4460// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4461// (AND X Y) Z which produces a better opt with EmitComparison
4462 static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4463 SelectionDAG &DAG, const SDLoc dl) {
4464 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4465 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4466 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4467 if (LHSConstOp && RHSConst) {
4468 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4469 uint64_t RHSConstant = RHSConst->getZExtValue();
4470 if (isPowerOf2_64(RHSConstant)) {
4471 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4472 LHS =
4473 DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
4474 DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
4475 RHS = DAG.getConstant(0, dl, RHS.getValueType());
4476 CC = ISD::SETEQ;
4477 }
4478 }
4479 }
4480}
4481
4482SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4483 SelectionDAG &DAG) const {
4484 EVT VT = Op.getValueType();
4485 if (VT.isScalableVector()) {
4486 SDValue SrcVal = Op.getOperand(0);
4487
4488 if (SrcVal.getValueType().getScalarType() == MVT::bf16) {
4489 // bf16 and f32 share the same exponent range so the conversion requires
4490 // them to be aligned with the new mantissa bits zero'd. This is just a
4491 // left shift that is best to isel directly.
4492 if (VT == MVT::nxv2f32 || VT == MVT::nxv4f32)
4493 return Op;
4494
4495 if (VT != MVT::nxv2f64)
4496 return SDValue();
4497
4498 // Break other conversions in two with the first part converting to f32
4499 // and the second using native f32->VT instructions.
4500 SDLoc DL(Op);
4501 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4502 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4503 }
4504
4505 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4506 }
4507
4508 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4509 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4510
4511 bool IsStrict = Op->isStrictFPOpcode();
4512 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4513 EVT Op0VT = Op0.getValueType();
4514 if (VT == MVT::f64) {
4515 // f32->f64 and f16->f64 extends are legal.
4516 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4517 return Op;
4518 // Split bf16->f64 extends into two fpextends.
4519 if (Op0VT == MVT::bf16 && IsStrict) {
4520 SDValue Ext1 =
4521 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4522 {Op0, Op.getOperand(0)});
4523 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4524 {Ext1, Ext1.getValue(1)});
4525 }
4526 if (Op0VT == MVT::bf16)
4527 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4528 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4529 return SDValue();
4530 }
4531
4532 if (VT.getScalarType() == MVT::f32) {
4533 // FP16->FP32 extends are legal for f32 and v4f32.
4534 if (Op0VT.getScalarType() == MVT::f16)
4535 return Op;
4536 if (Op0VT.getScalarType() == MVT::bf16) {
4537 SDLoc DL(Op);
4538 EVT IVT = VT.changeTypeToInteger();
4539 if (!Op0VT.isVector()) {
4540 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
4541 IVT = MVT::v4i32;
4542 }
4543
4544 EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
4545 SDValue Ext =
4546 DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
4547 SDValue Shift =
4548 DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
4549 if (!Op0VT.isVector())
4550 Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
4551 DAG.getConstant(0, DL, MVT::i64));
4552 Shift = DAG.getBitcast(VT, Shift);
4553 return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
4554 : Shift;
4555 }
4556 return SDValue();
4557 }
4558
4559 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4560 return SDValue();
4561}
4562
4563SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4564 SelectionDAG &DAG) const {
4565 EVT VT = Op.getValueType();
4566 bool IsStrict = Op->isStrictFPOpcode();
4567 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4568 EVT SrcVT = SrcVal.getValueType();
4569 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4570
4571 if (VT.isScalableVector()) {
4572 if (VT.getScalarType() != MVT::bf16)
4573 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4574
4575 SDLoc DL(Op);
4576 constexpr EVT I32 = MVT::nxv4i32;
4577 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4578
4579 SDValue NaN;
4580 SDValue Narrow;
4581
4582 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4583 if (Subtarget->hasBF16())
4584 return LowerToPredicatedOp(Op, DAG,
4585 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4586
4587 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4588
4589 // Set the quiet bit.
4590 if (!DAG.isKnownNeverSNaN(SrcVal))
4591 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4592 } else if (SrcVT == MVT::nxv2f64 &&
4593 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4594 // Round to float without introducing rounding errors and try again.
4595 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4596 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4597 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4598
4599 SmallVector<SDValue, 3> NewOps;
4600 if (IsStrict)
4601 NewOps.push_back(Op.getOperand(0));
4602 NewOps.push_back(Narrow);
4603 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4604 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4605 } else
4606 return SDValue();
4607
4608 if (!Trunc) {
4609 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4610 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4611 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4612 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4613 }
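// The bias above implements round-to-nearest-even on the f32 payload: adding
// 0x7fff rounds up whenever the discarded low 16 bits exceed 0x8000, and the
// extra lsb breaks exact ties (low bits == 0x8000) towards an even bf16
// mantissa.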
4614
4615 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4616 // 0x80000000.
4617 if (NaN) {
4618 EVT I1 = I32.changeElementType(MVT::i1);
4619 EVT CondVT = VT.changeElementType(MVT::i1);
4620 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4621 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4622 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4623 }
4624
4625 // Now that we have rounded, shift the bits into position.
4626 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4627 return getSVESafeBitCast(VT, Narrow, DAG);
4628 }
4629
4630 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4631 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4632
4633 // Expand cases where the result type is BF16 but we don't have hardware
4634 // instructions to lower it.
4635 if (VT.getScalarType() == MVT::bf16 &&
4636 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4637 Subtarget->hasBF16())) {
4638 SDLoc dl(Op);
4639 SDValue Narrow = SrcVal;
4640 SDValue NaN;
4641 EVT I32 = SrcVT.changeElementType(MVT::i32);
4642 EVT F32 = SrcVT.changeElementType(MVT::f32);
4643 if (SrcVT.getScalarType() == MVT::f32) {
4644 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4645 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4646 if (!NeverSNaN) {
4647 // Set the quiet bit.
4648 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4649 DAG.getConstant(0x400000, dl, I32));
4650 }
4651 } else if (SrcVT.getScalarType() == MVT::f64) {
4652 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4653 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4654 } else {
4655 return SDValue();
4656 }
4657 if (!Trunc) {
4658 SDValue One = DAG.getConstant(1, dl, I32);
4659 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4660 DAG.getShiftAmountConstant(16, I32, dl));
4661 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4662 SDValue RoundingBias =
4663 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4664 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4665 }
4666
4667 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4668 // 0x80000000.
4669 if (NaN) {
4670 SDValue IsNaN = DAG.getSetCC(
4671 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4672 SrcVal, SrcVal, ISD::SETUO);
4673 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4674 }
4675
4676 // Now that we have rounded, shift the bits into position.
4677 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4678 DAG.getShiftAmountConstant(16, I32, dl));
4679 if (VT.isVector()) {
4680 EVT I16 = I32.changeVectorElementType(MVT::i16);
4681 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4682 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4683 }
4684 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4685 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4686 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4687 : Result;
4688 }
4689
4690 if (SrcVT != MVT::f128) {
4691 // Expand cases where the input is a vector bigger than NEON.
4692 if (useSVEForFixedLengthVectorVT(SrcVT))
4693 return SDValue();
4694
4695 // It's legal except when f128 is involved
4696 return Op;
4697 }
4698
4699 return SDValue();
4700}
4701
4702SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4703 SelectionDAG &DAG) const {
4704 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4705 // Any additional optimization in this function should be recorded
4706 // in the cost tables.
4707 bool IsStrict = Op->isStrictFPOpcode();
4708 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4709 EVT VT = Op.getValueType();
4710
4711 if (VT.isScalableVector()) {
4712 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4713 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4714 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4715 return LowerToPredicatedOp(Op, DAG, Opcode);
4716 }
4717
4718 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4719 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4720 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4721
4722 unsigned NumElts = InVT.getVectorNumElements();
4723
4724 // f16 conversions are promoted to f32 when full fp16 is not supported.
4725 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4726 InVT.getVectorElementType() == MVT::bf16) {
4727 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4728 SDLoc dl(Op);
4729 if (IsStrict) {
4730 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4731 {Op.getOperand(0), Op.getOperand(1)});
4732 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4733 {Ext.getValue(1), Ext.getValue(0)});
4734 }
4735 return DAG.getNode(
4736 Op.getOpcode(), dl, Op.getValueType(),
4737 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4738 }
4739
4740 uint64_t VTSize = VT.getFixedSizeInBits();
4741 uint64_t InVTSize = InVT.getFixedSizeInBits();
4742 if (VTSize < InVTSize) {
4743 SDLoc dl(Op);
4744 if (IsStrict) {
4745 InVT = InVT.changeVectorElementTypeToInteger();
4746 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4747 {Op.getOperand(0), Op.getOperand(1)});
4748 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4749 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4750 }
4751 SDValue Cv =
4752 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4753 Op.getOperand(0));
4754 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4755 }
4756
4757 if (VTSize > InVTSize) {
4758 SDLoc dl(Op);
4759 MVT ExtVT =
4760 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4761 VT.getVectorNumElements());
4762 if (IsStrict) {
4763 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4764 {Op.getOperand(0), Op.getOperand(1)});
4765 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4766 {Ext.getValue(1), Ext.getValue(0)});
4767 }
4768 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4769 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4770 }
4771
4772 // Use a scalar operation for conversions between single-element vectors of
4773 // the same size.
4774 if (NumElts == 1) {
4775 SDLoc dl(Op);
4776 SDValue Extract = DAG.getNode(
4777 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4778 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4779 EVT ScalarVT = VT.getScalarType();
4780 if (IsStrict)
4781 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4782 {Op.getOperand(0), Extract});
4783 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4784 }
4785
4786 // Type changing conversions are illegal.
4787 return Op;
4788}
4789
4790SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4791 SelectionDAG &DAG) const {
4792 bool IsStrict = Op->isStrictFPOpcode();
4793 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4794
4795 if (SrcVal.getValueType().isVector())
4796 return LowerVectorFP_TO_INT(Op, DAG);
4797
4798 // f16 conversions are promoted to f32 when full fp16 is not supported.
4799 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4800 SrcVal.getValueType() == MVT::bf16) {
4801 SDLoc dl(Op);
4802 if (IsStrict) {
4803 SDValue Ext =
4804 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4805 {Op.getOperand(0), SrcVal});
4806 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4807 {Ext.getValue(1), Ext.getValue(0)});
4808 }
4809 return DAG.getNode(
4810 Op.getOpcode(), dl, Op.getValueType(),
4811 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4812 }
4813
4814 if (SrcVal.getValueType() != MVT::f128) {
4815 // It's legal except when f128 is involved
4816 return Op;
4817 }
4818
4819 return SDValue();
4820}
4821
4822SDValue
4823AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4824 SelectionDAG &DAG) const {
4825 // AArch64 FP-to-int conversions saturate to the destination element size, so
4826 // we can lower common saturating conversions to simple instructions.
4827 SDValue SrcVal = Op.getOperand(0);
4828 EVT SrcVT = SrcVal.getValueType();
4829 EVT DstVT = Op.getValueType();
4830 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4831
4832 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4833 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4834 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4835 assert(SatWidth <= DstElementWidth &&
4836 "Saturation width cannot exceed result width");
4837
4838 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4839 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4840 // types, so this is hard to reach.
4841 if (DstVT.isScalableVector())
4842 return SDValue();
4843
4844 EVT SrcElementVT = SrcVT.getVectorElementType();
4845
4846 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4847 SDLoc DL(Op);
4848 SDValue SrcVal2;
4849 if ((SrcElementVT == MVT::f16 &&
4850 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4851 SrcElementVT == MVT::bf16) {
4852 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4853 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4854 // If we are extending to a v8f32, split into two v4f32 to produce legal
4855 // types.
4856 if (F32VT.getSizeInBits() > 128) {
4857 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4858 F32VT = F32VT.getHalfNumVectorElementsVT();
4859 }
4860 SrcVT = F32VT;
4861 SrcElementVT = MVT::f32;
4862 SrcElementWidth = 32;
4863 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4864 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4865 return SDValue();
4866
4867 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4868 // width and produce an fcvtzu.
4869 if (SatWidth == 64 && SrcElementWidth < 64) {
4870 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4871 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4872 SrcVT = F64VT;
4873 SrcElementVT = MVT::f64;
4874 SrcElementWidth = 64;
4875 }
4876 // Cases that we can emit directly.
4877 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4878 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4879 DAG.getValueType(DstVT.getScalarType()));
4880 if (SrcVal2) {
4881 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4882 DAG.getValueType(DstVT.getScalarType()));
4883 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4884 }
4885 return Res;
4886 }
4887
4888 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4889 // result. This is only valid if the legal cvt is larger than the saturate
4890 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4891 // (at least until sqxtn is selected).
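// For example, a v4i16 fptosi.sat(v4f32, i16) is emitted as an FP_TO_SINT_SAT
// to v4i32, clamped via smin with 0x7fff and smax with 0xffff8000, and finally
// truncated to v4i16.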
4892 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4893 return SDValue();
4894
4895 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4896 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4897 DAG.getValueType(IntVT.getScalarType()));
4898 SDValue NativeCvt2 =
4899 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4900 DAG.getValueType(IntVT.getScalarType()))
4901 : SDValue();
4902 SDValue Sat, Sat2;
4903 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4904 SDValue MinC = DAG.getConstant(
4905 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4906 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4907 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4908 SDValue MaxC = DAG.getConstant(
4909 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4910 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4911 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4912 } else {
4913 SDValue MinC = DAG.getConstant(
4914 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4915 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4916 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4917 }
4918
4919 if (SrcVal2)
4920 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4922 Sat, Sat2);
4923
4924 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4925}
4926
4927SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4928 SelectionDAG &DAG) const {
4929 // AArch64 FP-to-int conversions saturate to the destination register size, so
4930 // we can lower common saturating conversions to simple instructions.
4931 SDValue SrcVal = Op.getOperand(0);
4932 EVT SrcVT = SrcVal.getValueType();
4933
4934 if (SrcVT.isVector())
4935 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4936
4937 EVT DstVT = Op.getValueType();
4938 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4939 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4940 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4941 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4942
4943 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4944 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4945 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4946 SrcVT = MVT::f32;
4947 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4948 SrcVT != MVT::bf16)
4949 return SDValue();
4950
4951 SDLoc DL(Op);
4952 // Cases that we can emit directly.
4953 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4954 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4955 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4956 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4957 DAG.getValueType(DstVT));
4958
4959 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4960 // result. This is only valid if the legal cvt is larger than the saturate
4961 // width.
4962 if (DstWidth < SatWidth)
4963 return SDValue();
4964
4965 SDValue NativeCvt =
4966 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4967 SDValue Sat;
4968 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4969 SDValue MinC = DAG.getConstant(
4970 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4971 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4972 SDValue MaxC = DAG.getConstant(
4973 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4974 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4975 } else {
4976 SDValue MinC = DAG.getConstant(
4977 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4978 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4979 }
4980
4981 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4982}
4983
4984SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4985 SelectionDAG &DAG) const {
4986 EVT VT = Op.getValueType();
4987 SDValue Src = Op.getOperand(0);
4988 SDLoc DL(Op);
4989
4990 assert(VT.isVector() && "Expected vector type");
4991
4992 EVT CastVT =
4993 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4994
4995 // Round the floating-point value into a floating-point register with the
4996 // current rounding mode.
4997 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4998
4999 // Truncate the rounded floating point to an integer.
5000 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5001 DAG.getValueType(VT.getVectorElementType()));
5002}
5003
5004SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5005 SelectionDAG &DAG) const {
5006 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5007 // Any additional optimization in this function should be recorded
5008 // in the cost tables.
5009 bool IsStrict = Op->isStrictFPOpcode();
5010 EVT VT = Op.getValueType();
5011 SDLoc dl(Op);
5012 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5013 EVT InVT = In.getValueType();
5014 unsigned Opc = Op.getOpcode();
5015 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5016
5017 if (VT.isScalableVector()) {
5018 if (InVT.getVectorElementType() == MVT::i1) {
5019 // We can't directly extend an SVE predicate; extend it first.
5020 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5021 EVT CastVT = getPromotedVTForPredicate(InVT);
5022 In = DAG.getNode(CastOpc, dl, CastVT, In);
5023 return DAG.getNode(Opc, dl, VT, In);
5024 }
5025
5026 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5027 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5028 return LowerToPredicatedOp(Op, DAG, Opcode);
5029 }
5030
5031 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5032 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5033 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5034
5035 // Promote bf16 conversions to f32.
5036 if (VT.getVectorElementType() == MVT::bf16) {
5037 EVT F32 = VT.changeElementType(MVT::f32);
5038 if (IsStrict) {
5039 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
5040 {Op.getOperand(0), In});
5041 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5042 {Op.getValueType(), MVT::Other},
5043 {Val.getValue(1), Val.getValue(0),
5044 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5045 }
5046 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5047 DAG.getNode(Op.getOpcode(), dl, F32, In),
5048 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5049 }
5050
5051 uint64_t VTSize = VT.getFixedSizeInBits();
5052 uint64_t InVTSize = InVT.getFixedSizeInBits();
5053 if (VTSize < InVTSize) {
5054 MVT CastVT =
5055 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5056 InVT.getVectorNumElements());
5057 if (IsStrict) {
5058 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
5059 {Op.getOperand(0), In});
5060 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
5061 {In.getValue(1), In.getValue(0),
5062 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5063 }
5064 In = DAG.getNode(Opc, dl, CastVT, In);
5065 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
5066 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5067 }
5068
5069 if (VTSize > InVTSize) {
5070 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5071 EVT CastVT = VT.changeVectorElementTypeToInteger();
5072 In = DAG.getNode(CastOpc, dl, CastVT, In);
5073 if (IsStrict)
5074 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
5075 return DAG.getNode(Opc, dl, VT, In);
5076 }
5077
5078 // Use a scalar operation for conversions between single-element vectors of
5079 // the same size.
5080 if (VT.getVectorNumElements() == 1) {
5081 SDValue Extract = DAG.getNode(
5082 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
5083 In, DAG.getConstant(0, dl, MVT::i64));
5084 EVT ScalarVT = VT.getScalarType();
5085 if (IsStrict)
5086 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
5087 {Op.getOperand(0), Extract});
5088 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
5089 }
5090
5091 return Op;
5092}
5093
5094SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5095 SelectionDAG &DAG) const {
5096 if (Op.getValueType().isVector())
5097 return LowerVectorINT_TO_FP(Op, DAG);
5098
5099 bool IsStrict = Op->isStrictFPOpcode();
5100 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5101
5102 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5103 Op->getOpcode() == ISD::SINT_TO_FP;
5104
5105 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5106 SDLoc dl(Op);
5107 if (IsStrict) {
5108 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
5109 {Op.getOperand(0), SrcVal});
5110 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5111 {Op.getValueType(), MVT::Other},
5112 {Val.getValue(1), Val.getValue(0),
5113 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5114 }
5115 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5116 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
5117 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5118 };
5119
5120 if (Op.getValueType() == MVT::bf16) {
5121 unsigned MaxWidth = IsSigned
5122 ? DAG.ComputeMaxSignificantBits(SrcVal)
5123 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5124 // bf16 conversions are promoted to f32 when converting from i16.
5125 if (MaxWidth <= 24) {
5126 return IntToFpViaPromotion(MVT::f32);
5127 }
5128
5129 // bf16 conversions are promoted to f64 when converting from i32.
5130 if (MaxWidth <= 53) {
5131 return IntToFpViaPromotion(MVT::f64);
5132 }
5133
5134 // We need to be careful about i64 -> bf16.
5135 // Consider the value 22216703.
5136 // This number cannot be represented exactly as an f32, so an itofp will
5137 // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0.
5138 // However, the correctly rounded bf16 result is 22151168.0.
5139 // We need to use sticky rounding to get this correct.
5140 if (SrcVal.getValueType() == MVT::i64) {
5141 SDLoc DL(Op);
5142 // This algorithm is equivalent to the following:
5143 // uint64_t SrcHi = SrcVal & ~0xfffull;
5144 // uint64_t SrcLo = SrcVal & 0xfffull;
5145 // uint64_t Highest = SrcVal >> 53;
5146 // bool HasHighest = Highest != 0;
5147 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5148 // double Rounded = static_cast<double>(ToRound);
5149 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5150 // uint64_t HasLo = SrcLo != 0;
5151 // bool NeedsAdjustment = HasHighest & HasLo;
5152 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5153 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5154 // return static_cast<__bf16>(Adjusted);
5155 //
5156 // Essentially, what happens is that SrcVal either fits perfectly in a
5157 // double-precision value or it is too big. If it is sufficiently small,
5158 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5159 // ensure that u64 -> double has no rounding error by only using the 52
5160 // MSB of the input. The low order bits will get merged into a sticky bit
5161 // which will avoid issues incurred by double rounding.
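// Concretely: when Highest != 0 the value needs more than 53 bits, but SrcHi
// (with its low 12 bits cleared) has at most 52 significant bits and so
// converts to double exactly. OR-ing "were any low bits set" into the LSB of
// the double's bit pattern acts as a sticky bit, so the final round to bf16
// never sees an artificial halfway case created by the first conversion.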
5162
5163 // Signed conversion is more or less like so:
5164 // copysign((__bf16)abs(SrcVal), SrcVal)
5165 SDValue SignBit;
5166 if (IsSigned) {
5167 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5168 DAG.getConstant(1ull << 63, DL, MVT::i64));
5169 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5170 }
5171 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5172 DAG.getConstant(~0xfffull, DL, MVT::i64));
5173 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5174 DAG.getConstant(0xfffull, DL, MVT::i64));
5175 SDValue Highest =
5176 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5177 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5178 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5179 SDValue ToRound =
5180 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5181 SDValue Rounded =
5182 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5183 {Op.getOperand(0), ToRound})
5184 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5185
5186 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5187 if (SignBit) {
5188 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5189 }
5190
5191 SDValue HasHighest = DAG.getSetCC(
5192 DL,
5193 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5194 Highest, Zero64, ISD::SETNE);
5195
5196 SDValue HasLo = DAG.getSetCC(
5197 DL,
5198 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5199 SrcLo, Zero64, ISD::SETNE);
5200
5201 SDValue NeedsAdjustment =
5202 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5203 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5204
5205 SDValue AdjustedBits =
5206 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5207 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5208 return IsStrict
5209 ? DAG.getNode(
5210 ISD::STRICT_FP_ROUND, DL,
5211 {Op.getValueType(), MVT::Other},
5212 {Rounded.getValue(1), Adjusted,
5213 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5214 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5215 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5216 }
5217 }
5218
5219 // f16 conversions are promoted to f32 when full fp16 is not supported.
5220 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5221 return IntToFpViaPromotion(MVT::f32);
5222 }
5223
5224 // i128 conversions are libcalls.
5225 if (SrcVal.getValueType() == MVT::i128)
5226 return SDValue();
5227
5228 // Other conversions are legal, unless it's to the completely software-based
5229 // fp128.
5230 if (Op.getValueType() != MVT::f128)
5231 return Op;
5232 return SDValue();
5233}
5234
5235SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5236 SelectionDAG &DAG) const {
5237 // For iOS, we want to call an alternative entry point: __sincos_stret,
5238 // which returns the values in two S / D registers.
5239 SDLoc dl(Op);
5240 SDValue Arg = Op.getOperand(0);
5241 EVT ArgVT = Arg.getValueType();
5242 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5243
5244 ArgListTy Args;
5245 ArgListEntry Entry;
5246
5247 Entry.Node = Arg;
5248 Entry.Ty = ArgTy;
5249 Entry.IsSExt = false;
5250 Entry.IsZExt = false;
5251 Args.push_back(Entry);
5252
5253 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5254 : RTLIB::SINCOS_STRET_F32;
5255 const char *LibcallName = getLibcallName(LC);
5256 SDValue Callee =
5257 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5258
5259 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5260 TargetLowering::CallLoweringInfo CLI(DAG);
5261 CLI.setDebugLoc(dl)
5262 .setChain(DAG.getEntryNode())
5263 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
5264
5265 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5266 return CallResult.first;
5267}
5268
5269static MVT getSVEContainerType(EVT ContentTy);
5270
5271SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5272 SelectionDAG &DAG) const {
5273 EVT OpVT = Op.getValueType();
5274 EVT ArgVT = Op.getOperand(0).getValueType();
5275
5276 if (useSVEForFixedLengthVectorVT(OpVT))
5277 return LowerFixedLengthBitcastToSVE(Op, DAG);
5278
5279 if (OpVT.isScalableVector()) {
5280 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5281
5282 // Handle type legalisation first.
5283 if (!isTypeLegal(ArgVT)) {
5284 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5285 "Expected int->fp bitcast!");
5286
5287 // Bitcasting between unpacked vector types of different element counts is
5288 // not a NOP because the live elements are laid out differently.
5289 // 01234567
5290 // e.g. nxv2i32 = XX??XX??
5291 // nxv4f16 = X?X?X?X?
5292 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5293 return SDValue();
5294
5295 SDValue ExtResult =
5296 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getPackedSVEVectorVT(OpVT),
5297 Op.getOperand(0));
5298 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5299 }
5300
5301 // Bitcasts between legal types with the same element count are legal.
5302 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5303 return Op;
5304
5305 // getSVESafeBitCast does not support casting between unpacked types.
5306 if (!isPackedVectorType(OpVT, DAG))
5307 return SDValue();
5308
5309 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5310 }
5311
5312 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5313 return SDValue();
5314
5315 // Bitcasts between f16 and bf16 are legal.
5316 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5317 return Op;
5318
5319 assert(ArgVT == MVT::i16);
5320 SDLoc DL(Op);
5321
5322 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5323 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5324 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5325}
5326
5327// Returns lane if Op extracts from a two-element vector and lane is constant
5328// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5329static std::optional<uint64_t>
5330getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5331 SDNode *OpNode = Op.getNode();
5332 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5333 return std::nullopt;
5334
5335 EVT VT = OpNode->getOperand(0).getValueType();
5336 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5337 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5338 return std::nullopt;
5339
5340 return C->getZExtValue();
5341}
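// For example, (extractelt (v2i64 %v), 1) yields 1; a non-constant lane or a
// source that is not a two-element fixed-length vector yields std::nullopt.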
5342
5343static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5344 bool isSigned) {
5345 EVT VT = N.getValueType();
5346
5347 if (N.getOpcode() != ISD::BUILD_VECTOR)
5348 return false;
5349
5350 for (const SDValue &Elt : N->op_values()) {
5351 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5352 unsigned EltSize = VT.getScalarSizeInBits();
5353 unsigned HalfSize = EltSize / 2;
5354 if (isSigned) {
5355 if (!isIntN(HalfSize, C->getSExtValue()))
5356 return false;
5357 } else {
5358 if (!isUIntN(HalfSize, C->getZExtValue()))
5359 return false;
5360 }
5361 continue;
5362 }
5363 return false;
5364 }
5365
5366 return true;
5367}
5368
5369static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5370 EVT VT = N.getValueType();
5371 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5372 EVT HalfVT = EVT::getVectorVT(
5373 *DAG.getContext(),
5374 VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5375 VT.getVectorElementCount());
5376 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5377}
5378
5379static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5380 return N.getOpcode() == ISD::SIGN_EXTEND ||
5381 N.getOpcode() == ISD::ANY_EXTEND ||
5382 isExtendedBUILD_VECTOR(N, DAG, true);
5383}
5384
5385static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5386 return N.getOpcode() == ISD::ZERO_EXTEND ||
5387 N.getOpcode() == ISD::ANY_EXTEND ||
5388 isExtendedBUILD_VECTOR(N, DAG, false);
5389}
5390
5391static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5392 unsigned Opcode = N.getOpcode();
5393 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5394 SDValue N0 = N.getOperand(0);
5395 SDValue N1 = N.getOperand(1);
5396 return N0->hasOneUse() && N1->hasOneUse() &&
5397 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5398 }
5399 return false;
5400}
5401
5402static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5403 unsigned Opcode = N.getOpcode();
5404 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5405 SDValue N0 = N.getOperand(0);
5406 SDValue N1 = N.getOperand(1);
5407 return N0->hasOneUse() && N1->hasOneUse() &&
5408 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5409 }
5410 return false;
5411}
5412
5413SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5414 SelectionDAG &DAG) const {
5415 // The rounding mode is in bits 23:22 of the FPCR.
5416 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5417 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5418 // so that the shift + and get folded into a bitfield extract.
5419 SDLoc dl(Op);
5420
5421 SDValue Chain = Op.getOperand(0);
5422 SDValue FPCR_64 = DAG.getNode(
5423 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5424 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5425 Chain = FPCR_64.getValue(1);
5426 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5427 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5428 DAG.getConstant(1U << 22, dl, MVT::i32));
5429 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5430 DAG.getConstant(22, dl, MVT::i32));
5431 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5432 DAG.getConstant(3, dl, MVT::i32));
5433 return DAG.getMergeValues({AND, Chain}, dl);
5434}
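// Sanity check of the formula: for FPCR[23:22] = 0, 1, 2 and 3 it computes
// ((r + 1) & 3) = 1, 2, 3 and 0 respectively, which matches the FLT_ROUNDS
// encoding (1 = to nearest, 2 = upward, 3 = downward, 0 = toward zero).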
5435
5436SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5437 SelectionDAG &DAG) const {
5438 SDLoc DL(Op);
5439 SDValue Chain = Op->getOperand(0);
5440 SDValue RMValue = Op->getOperand(1);
5441
5442 // The rounding mode is in bits 23:22 of the FPCR.
5443 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5444 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5445 // ((arg - 1) & 3) << 22).
5446 //
5447 // The argument of llvm.set.rounding must be in the range [0, 3], so
5448 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5449 // code that generates llvm.set.rounding to ensure this condition.
5450
5451 // Calculate new value of FPCR[23:22].
5452 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5453 DAG.getConstant(1, DL, MVT::i32));
5454 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5455 DAG.getConstant(0x3, DL, MVT::i32));
5456 RMValue =
5457 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5458 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5459 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5460
5461 // Get current value of FPCR.
5462 SDValue Ops[] = {
5463 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5464 SDValue FPCR =
5465 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5466 Chain = FPCR.getValue(1);
5467 FPCR = FPCR.getValue(0);
5468
5469 // Put the new rounding mode into FPCR[23:22].
5470 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5471 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5472 DAG.getConstant(RMMask, DL, MVT::i64));
5473 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5474 SDValue Ops2[] = {
5475 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5476 FPCR};
5477 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5478}
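// For reference, ((arg - 1) & 3) maps the llvm.set.rounding arguments
// 0 (toward zero), 1 (to nearest), 2 (upward) and 3 (downward) onto the FPCR
// encodings 3, 0, 1 and 2, matching the mapping described above.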
5479
5480SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5481 SelectionDAG &DAG) const {
5482 SDLoc DL(Op);
5483 SDValue Chain = Op->getOperand(0);
5484
5485 // Get current value of FPCR.
5486 SDValue Ops[] = {
5487 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5488 SDValue FPCR =
5489 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5490 Chain = FPCR.getValue(1);
5491 FPCR = FPCR.getValue(0);
5492
5493 // Truncate FPCR to 32 bits.
5494 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5495
5496 return DAG.getMergeValues({Result, Chain}, DL);
5497}
5498
5499SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5500 SelectionDAG &DAG) const {
5501 SDLoc DL(Op);
5502 SDValue Chain = Op->getOperand(0);
5503 SDValue Mode = Op->getOperand(1);
5504
5505 // Extend the specified value to 64 bits.
5506 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5507
5508 // Set new value of FPCR.
5509 SDValue Ops2[] = {
5510 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5511 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5512}
5513
5514SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5515 SelectionDAG &DAG) const {
5516 SDLoc DL(Op);
5517 SDValue Chain = Op->getOperand(0);
5518
5519 // Get current value of FPCR.
5520 SDValue Ops[] = {
5521 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5522 SDValue FPCR =
5523 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5524 Chain = FPCR.getValue(1);
5525 FPCR = FPCR.getValue(0);
5526
5527 // Clear bits that are not reserved.
5528 SDValue FPSCRMasked = DAG.getNode(
5529 ISD::AND, DL, MVT::i64, FPCR,
5531
5532 // Set new value of FPCR.
5533 SDValue Ops2[] = {Chain,
5534 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5535 FPSCRMasked};
5536 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5537}
5538
5539static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5540 SDLoc DL, bool &IsMLA) {
5541 bool IsN0SExt = isSignExtended(N0, DAG);
5542 bool IsN1SExt = isSignExtended(N1, DAG);
5543 if (IsN0SExt && IsN1SExt)
5544 return AArch64ISD::SMULL;
5545
5546 bool IsN0ZExt = isZeroExtended(N0, DAG);
5547 bool IsN1ZExt = isZeroExtended(N1, DAG);
5548
5549 if (IsN0ZExt && IsN1ZExt)
5550 return AArch64ISD::UMULL;
5551
5552 // Select UMULL if we can replace the other operand with an extend.
5553 EVT VT = N0.getValueType();
5554 unsigned EltSize = VT.getScalarSizeInBits();
5555 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5556 if (IsN0ZExt || IsN1ZExt) {
5557 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5558 return AArch64ISD::UMULL;
5559 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5560 DAG.MaskedValueIsZero(N1, Mask)) {
5561 // For v2i64 we look more aggressively at both operands being zero, to avoid
5562 // scalarization.
5563 return AArch64ISD::UMULL;
5564 }
5565
5566 if (IsN0SExt || IsN1SExt) {
5567 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5568 return AArch64ISD::SMULL;
5569 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5570 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5571 return AArch64ISD::SMULL;
5572 }
5573
5574 if (!IsN1SExt && !IsN1ZExt)
5575 return 0;
5576
5577 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5578 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5579 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5580 IsMLA = true;
5581 return AArch64ISD::SMULL;
5582 }
5583 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5584 IsMLA = true;
5585 return AArch64ISD::UMULL;
5586 }
5587 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5588 std::swap(N0, N1);
5589 IsMLA = true;
5590 return AArch64ISD::UMULL;
5591 }
5592 return 0;
5593}
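// For example, (mul (sext v8i8 A), (sext v8i8 B)) is matched here as
// AArch64ISD::SMULL and (mul (zext A), (zext B)) as AArch64ISD::UMULL, while
// the IsMLA path splits (add (zext A), (zext B)) * (zext C) into two UMULLs
// so the addition happens in the wide type.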
5594
5595SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5596 EVT VT = Op.getValueType();
5597
5598 bool OverrideNEON = !Subtarget->isNeonAvailable();
5599 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5600 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5601
5602 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5603 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5604 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5605 "unexpected type for custom-lowering ISD::MUL");
5606 SDValue N0 = Op.getOperand(0);
5607 SDValue N1 = Op.getOperand(1);
5608 bool isMLA = false;
5609 EVT OVT = VT;
5610 if (VT.is64BitVector()) {
5611 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5612 isNullConstant(N0.getOperand(1)) &&
5613 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5614 isNullConstant(N1.getOperand(1))) {
5615 N0 = N0.getOperand(0);
5616 N1 = N1.getOperand(0);
5617 VT = N0.getValueType();
5618 } else {
5619 if (VT == MVT::v1i64) {
5620 if (Subtarget->hasSVE())
5621 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5622 // Fall through to expand this. It is not legal.
5623 return SDValue();
5624 } else
5625 // Other vector multiplications are legal.
5626 return Op;
5627 }
5628 }
5629
5630 SDLoc DL(Op);
5631 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5632
5633 if (!NewOpc) {
5634 if (VT.getVectorElementType() == MVT::i64) {
5635 // If SVE is available then i64 vector multiplications can also be made
5636 // legal.
5637 if (Subtarget->hasSVE())
5638 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5639 // Fall through to expand this. It is not legal.
5640 return SDValue();
5641 } else
5642 // Other vector multiplications are legal.
5643 return Op;
5644 }
5645
5646 // Legalize to a S/UMULL instruction
5647 SDValue Op0;
5648 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5649 if (!isMLA) {
5650 Op0 = skipExtensionForVectorMULL(N0, DAG);
5651 assert(Op0.getValueType().is64BitVector() &&
5652 Op1.getValueType().is64BitVector() &&
5653 "unexpected types for extended operands to VMULL");
5654 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5655 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5656 DAG.getConstant(0, DL, MVT::i64));
5657 }
5658 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5659 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5660 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5661 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5662 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5663 EVT Op1VT = Op1.getValueType();
5664 return DAG.getNode(
5665 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5666 DAG.getNode(N0.getOpcode(), DL, VT,
5667 DAG.getNode(NewOpc, DL, VT,
5668 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5669 DAG.getNode(NewOpc, DL, VT,
5670 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5671 DAG.getConstant(0, DL, MVT::i64));
5672}
5673
5674static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5675 int Pattern) {
5676 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5677 return DAG.getConstant(1, DL, MVT::nxv1i1);
5678 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5679 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5680}
5681
5683 bool IsSigned, bool IsEqual) {
5684 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5685 !isa<ConstantSDNode>(Op.getOperand(2)))
5686 return SDValue();
5687
5688 SDLoc dl(Op);
5689 APInt X = Op.getConstantOperandAPInt(1);
5690 APInt Y = Op.getConstantOperandAPInt(2);
5691
5692 // When the second operand is the maximum value, comparisons that include
5693 // equality can never fail and thus we can return an all active predicate.
5694 if (IsEqual)
5695 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5696 return DAG.getConstant(1, dl, Op.getValueType());
5697
5698 bool Overflow;
5699 APInt NumActiveElems =
5700 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5701
5702 if (Overflow)
5703 return SDValue();
5704
5705 if (IsEqual) {
5706 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5707 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5708 : NumActiveElems.uadd_ov(One, Overflow);
5709 if (Overflow)
5710 return SDValue();
5711 }
5712
5713 std::optional<unsigned> PredPattern =
5715 unsigned MinSVEVectorSize = std::max(
5717 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5718 if (PredPattern != std::nullopt &&
5719 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5720 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5721
5722 return SDValue();
5723}
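// For example, a whilelo with constant operands 0 and 4 producing nxv4i1 has
// NumActiveElems == 4; that count is expressible as the VL4 pattern and fits
// in the guaranteed minimum vector (128 bits / 32-bit element granules = 4),
// so the node is lowered to a single PTRUE with the VL4 pattern.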
5724
5725// Returns a safe bitcast between two scalable vector predicates, where
5726// any newly created lanes from a widening bitcast are defined as zero.
5727static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5728 SDLoc DL(Op);
5729 EVT InVT = Op.getValueType();
5730
5731 assert(InVT.getVectorElementType() == MVT::i1 &&
5732 VT.getVectorElementType() == MVT::i1 &&
5733 "Expected a predicate-to-predicate bitcast");
5734 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5735 InVT.isScalableVector() &&
5736 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5737 "Only expect to cast between legal scalable predicate types!");
5738
5739 // Return the operand if the cast isn't changing type,
5740 if (InVT == VT)
5741 return Op;
5742
5743 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5744 // than VT. This will increase the chances of removing casts that introduce
5745 // new lanes, which have to be explicitly zero'd.
5746 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5747 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5748 Op.getOperand(1).getValueType().bitsGT(VT))
5749 Op = Op.getOperand(1);
5750
5751 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5752
5753 // We only have to zero the lanes if new lanes are being defined, e.g. when
5754 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5755 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5756 // we can return here.
5757 if (InVT.bitsGT(VT))
5758 return Reinterpret;
5759
5760 // Check if the other lanes are already known to be zeroed by
5761 // construction.
5762 if (isZeroingInactiveLanes(Op))
5763 return Reinterpret;
5764
5765 // Zero the newly introduced lanes.
5766 SDValue Mask = DAG.getConstant(1, DL, InVT);
5767 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5768 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5769}
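// The zeroing mask works because an all-true constant of the narrower
// predicate type, reinterpreted to the wider type, has bits set only in the
// positions corresponding to the original elements; AND-ing with it therefore
// clears exactly the lanes that the widening introduces.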
5770
5771SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5772 SDValue Chain, SDLoc DL,
5773 EVT VT) const {
5774 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5775 getPointerTy(DAG.getDataLayout()));
5776 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5777 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5778 TargetLowering::CallLoweringInfo CLI(DAG);
5779 ArgListTy Args;
5780 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5781 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5782 RetTy, Callee, std::move(Args));
5783 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5784 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5785 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5786 Mask);
5787}
5788
5789// Lower an SME LDR/STR ZA intrinsic
5790// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5791// folded into the instruction
5792// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5793// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5794// and tile slice registers
5795// ldr(%tileslice, %ptr, %vecnum)
5796// ->
5797// %svl = rdsvl
5798// %ptr2 = %ptr + %svl * %vecnum
5799// %tileslice2 = %tileslice + %vecnum
5800// ldr [%tileslice2, 0], [%ptr2, 0]
5801// Case 3: If the vecnum is an immediate out of range, then the same is done as
5802// case 2, but the base and slice registers are modified by the greatest
5803// multiple of 16 not exceeding the vecnum, and the remainder is folded into the
5804// instruction. This means that successive loads and stores that are offset from
5805// each other can share the same base and slice register updates.
5806// ldr(%tileslice, %ptr, 22)
5807// ldr(%tileslice, %ptr, 23)
5808// ->
5809// %svl = rdsvl
5810// %ptr2 = %ptr + %svl * 16
5811// %tileslice2 = %tileslice + 16
5812// ldr [%tileslice2, 6], [%ptr2, 6]
5813// ldr [%tileslice2, 7], [%ptr2, 7]
5814// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5815// operand and the immediate can be folded into the instruction, like case 2.
5816// ldr(%tileslice, %ptr, %vecnum + 7)
5817// ldr(%tileslice, %ptr, %vecnum + 8)
5818// ->
5819// %svl = rdsvl
5820// %ptr2 = %ptr + %svl * %vecnum
5821// %tileslice2 = %tileslice + %vecnum
5822// ldr [%tileslice2, 7], [%ptr2, 7]
5823// ldr [%tileslice2, 8], [%ptr2, 8]
5824// Case 5: The vecnum being an add of an immediate out of range is also handled,
5825// in which case the same remainder logic as case 3 is used.
5826static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5827 SDLoc DL(N);
5828
5829 SDValue TileSlice = N->getOperand(2);
5830 SDValue Base = N->getOperand(3);
5831 SDValue VecNum = N->getOperand(4);
5832 int32_t ConstAddend = 0;
5833 SDValue VarAddend = VecNum;
5834
5835 // If the vnum is an add of an immediate, we can fold it into the instruction
5836 if (VecNum.getOpcode() == ISD::ADD &&
5837 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5838 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5839 VarAddend = VecNum.getOperand(0);
5840 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5841 ConstAddend = ImmNode->getSExtValue();
5842 VarAddend = SDValue();
5843 }
5844
5845 int32_t ImmAddend = ConstAddend % 16;
5846 if (int32_t C = (ConstAddend - ImmAddend)) {
5847 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5848 VarAddend = VarAddend
5849 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5850 : CVal;
5851 }
5852
5853 if (VarAddend) {
5854 // Get the vector length that will be multiplied by vnum
5855 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5856 DAG.getConstant(1, DL, MVT::i32));
5857
5858 // Multiply SVL and vnum then add it to the base
5859 SDValue Mul = DAG.getNode(
5860 ISD::MUL, DL, MVT::i64,
5861 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5862 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5863 // Just add vnum to the tileslice
5864 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5865 }
5866
5867 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5868 DL, MVT::Other,
5869 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5870 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5871}
5872
5874 SDLoc dl(Op);
5875 SDValue ID =
5876 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
5877
5878 auto Op1 = Op.getOperand(1);
5879 auto Op2 = Op.getOperand(2);
5880 auto Mask = Op.getOperand(3);
5881
5882 EVT Op1VT = Op1.getValueType();
5883 EVT Op2VT = Op2.getValueType();
5884 EVT ResVT = Op.getValueType();
5885
5886 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5887 Op1VT.getVectorElementType() == MVT::i16) &&
5888 "Expected 8-bit or 16-bit characters.");
5889
5890 // Scalable vector type used to wrap operands.
5891 // A single container is enough for both operands because ultimately the
5892 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5893 EVT OpContainerVT = Op1VT.isScalableVector()
5894 ? Op1VT
5895 : getContainerForFixedLengthVector(DAG, Op1VT);
5896
5897 if (Op2VT.is128BitVector()) {
5898 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5899 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
5900 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5901 if (ResVT.isScalableVector())
5902 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
5903 DAG.getTargetConstant(0, dl, MVT::i64));
5904 } else {
5905 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5906 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5907 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
5908 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
5909 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
5910 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op2IntVT, Op2,
5911 DAG.getConstant(0, dl, MVT::i64));
5912 Op2 = DAG.getSplatVector(Op2PromotedVT, dl, Op2);
5913 Op2 = DAG.getBitcast(OpContainerVT, Op2);
5914 }
5915
5916 // If the result is scalable, we just need to carry out the MATCH.
5917 if (ResVT.isScalableVector())
5918 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1, Op2);
5919
5920 // If the result is fixed, we can still use MATCH but we need to wrap the
5921 // first operand and the mask in scalable vectors before doing so.
5922
5923 // Wrap the operands.
5924 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
5925 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, Op1VT, Mask);
5926 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5927
5928 // Carry out the match.
5929 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Mask.getValueType(),
5930 ID, Mask, Op1, Op2);
5931
5932 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
5933 // (v16i8/v8i8).
5934 Match = DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match);
5935 Match = convertFromScalableVector(DAG, Op1VT, Match);
5936 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Match);
5937}
5938
5939SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5940 SelectionDAG &DAG) const {
5941 unsigned IntNo = Op.getConstantOperandVal(1);
5942 SDLoc DL(Op);
5943 switch (IntNo) {
5944 default:
5945 return SDValue(); // Don't custom lower most intrinsics.
5946 case Intrinsic::aarch64_prefetch: {
5947 SDValue Chain = Op.getOperand(0);
5948 SDValue Addr = Op.getOperand(2);
5949
5950 unsigned IsWrite = Op.getConstantOperandVal(3);
5951 unsigned Locality = Op.getConstantOperandVal(4);
5952 unsigned IsStream = Op.getConstantOperandVal(5);
5953 unsigned IsData = Op.getConstantOperandVal(6);
5954 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5955 (!IsData << 3) | // IsDataCache bit
5956 (Locality << 1) | // Cache level bits
5957 (unsigned)IsStream; // Stream bit
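// For example, IsWrite = 1, IsData = 1, Locality = 0 and IsStream = 0 encode
// as PrfOp = (1 << 4) | (0 << 3) | (0 << 1) | 0 = 0b10000.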
5958
5959 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5960 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5961 }
5962 case Intrinsic::aarch64_sme_str:
5963 case Intrinsic::aarch64_sme_ldr: {
5964 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5965 }
5966 case Intrinsic::aarch64_sme_za_enable:
5967 return DAG.getNode(
5968 AArch64ISD::SMSTART, DL, MVT::Other,
5969 Op->getOperand(0), // Chain
5970 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5971 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5972 case Intrinsic::aarch64_sme_za_disable:
5973 return DAG.getNode(
5974 AArch64ISD::SMSTOP, DL, MVT::Other,
5975 Op->getOperand(0), // Chain
5976 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5977 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5978 }
5979}
5980
5981SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5982 SelectionDAG &DAG) const {
5983 unsigned IntNo = Op.getConstantOperandVal(1);
5984 SDLoc DL(Op);
5985 switch (IntNo) {
5986 default:
5987 return SDValue(); // Don't custom lower most intrinsics.
5988 case Intrinsic::aarch64_mops_memset_tag: {
5989 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5990 SDValue Chain = Node->getChain();
5991 SDValue Dst = Op.getOperand(2);
5992 SDValue Val = Op.getOperand(3);
5993 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5994 SDValue Size = Op.getOperand(4);
5995 auto Alignment = Node->getMemOperand()->getAlign();
5996 bool IsVol = Node->isVolatile();
5997 auto DstPtrInfo = Node->getPointerInfo();
5998
5999 const auto &SDI =
6000 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6001 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6002 Chain, Dst, Val, Size, Alignment, IsVol,
6003 DstPtrInfo, MachinePointerInfo{});
6004
6005 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6006 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6007 // LowerOperationWrapper will complain that the number of results has
6008 // changed.
6009 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6010 }
6011 }
6012}
6013
6014SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6015 SelectionDAG &DAG) const {
6016 unsigned IntNo = Op.getConstantOperandVal(0);
6017 SDLoc dl(Op);
6018 switch (IntNo) {
6019 default: return SDValue(); // Don't custom lower most intrinsics.
6020 case Intrinsic::thread_pointer: {
6021 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6022 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
6023 }
6024 case Intrinsic::aarch64_neon_abs: {
6025 EVT Ty = Op.getValueType();
6026 if (Ty == MVT::i64) {
6027 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
6028 Op.getOperand(1));
6029 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
6030 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
6031 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6032 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
6033 } else {
6034 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6035 }
6036 }
6037 case Intrinsic::aarch64_neon_pmull64: {
6038 SDValue LHS = Op.getOperand(1);
6039 SDValue RHS = Op.getOperand(2);
6040
6041 std::optional<uint64_t> LHSLane =
6042 getConstantLaneNumOfExtractHalfOperand(LHS);
6043 std::optional<uint64_t> RHSLane =
6044 getConstantLaneNumOfExtractHalfOperand(RHS);
6045
6046 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6047 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6048
6049 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6050 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6051 // which ISel recognizes better. For example, generate a ldr into d*
6052 // registers as opposed to a GPR load followed by a fmov.
6053 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6054 std::optional<uint64_t> OtherLane,
6055 const SDLoc &dl,
6056 SelectionDAG &DAG) -> SDValue {
6057 // If the operand is a higher half itself, rewrite it to
6058 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6059 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6060 if (NLane && *NLane == 1)
6061 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6062 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
6063
6064 // Operand N is not a higher half but the other operand is.
6065 if (OtherLane && *OtherLane == 1) {
6066 // If this operand is a lower half, rewrite it to
6067 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6068 // align lanes of two operands. A roundtrip sequence (to move from lane
6069 // 1 to lane 0) is like this:
6070 // mov x8, v0.d[1]
6071 // fmov d0, x8
6072 if (NLane && *NLane == 0)
6073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6074 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
6075 N.getOperand(0),
6076 DAG.getConstant(0, dl, MVT::i64)),
6077 DAG.getConstant(1, dl, MVT::i64));
6078
6079 // Otherwise just dup from main to all lanes.
6080 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
6081 }
6082
6083 // Neither operand is an extract of the higher half, so codegen may just use
6084 // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
6085 assert(N.getValueType() == MVT::i64 &&
6086 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6087 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
6088 };
6089
6090 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
6091 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
6092
6093 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
6094 }
6095 case Intrinsic::aarch64_neon_smax:
6096 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
6097 Op.getOperand(1), Op.getOperand(2));
6098 case Intrinsic::aarch64_neon_umax:
6099 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
6100 Op.getOperand(1), Op.getOperand(2));
6101 case Intrinsic::aarch64_neon_smin:
6102 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
6103 Op.getOperand(1), Op.getOperand(2));
6104 case Intrinsic::aarch64_neon_umin:
6105 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
6106 Op.getOperand(1), Op.getOperand(2));
6107 case Intrinsic::aarch64_neon_scalar_sqxtn:
6108 case Intrinsic::aarch64_neon_scalar_sqxtun:
6109 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6110 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6111 if (Op.getValueType() == MVT::i32)
6112 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6113 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
6114 Op.getOperand(0),
6115 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
6116 Op.getOperand(1))));
6117 return SDValue();
6118 }
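// The NEON saturating narrow intrinsics below map directly onto the generic
// saturating-truncate nodes (sqxtn -> TRUNCATE_SSAT_S, sqxtun -> TRUNCATE_SSAT_U,
// uqxtn -> TRUNCATE_USAT_U); the shift-narrow variants first build the
// corresponding vector shift and then truncate its result.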
6119 case Intrinsic::aarch64_neon_sqxtn:
6120 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6121 Op.getOperand(1));
6122 case Intrinsic::aarch64_neon_sqxtun:
6123 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6124 Op.getOperand(1));
6125 case Intrinsic::aarch64_neon_uqxtn:
6126 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6127 Op.getOperand(1));
6128 case Intrinsic::aarch64_neon_sqshrn:
6129 if (Op.getValueType().isVector())
6130 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6131 DAG.getNode(AArch64ISD::VASHR, dl,
6132 Op.getOperand(1).getValueType(),
6133 Op.getOperand(1), Op.getOperand(2)));
6134 return SDValue();
6135 case Intrinsic::aarch64_neon_sqshrun:
6136 if (Op.getValueType().isVector())
6137 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6138 DAG.getNode(AArch64ISD::VASHR, dl,
6139 Op.getOperand(1).getValueType(),
6140 Op.getOperand(1), Op.getOperand(2)));
6141 return SDValue();
6142 case Intrinsic::aarch64_neon_uqshrn:
6143 if (Op.getValueType().isVector())
6144 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6145 DAG.getNode(AArch64ISD::VLSHR, dl,
6146 Op.getOperand(1).getValueType(),
6147 Op.getOperand(1), Op.getOperand(2)));
6148 return SDValue();
6149 case Intrinsic::aarch64_neon_sqrshrn:
6150 if (Op.getValueType().isVector())
6151 return DAG.getNode(
6152 ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6153 DAG.getNode(
6154 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6155 Op.getOperand(1), Op.getOperand(2)));
6156 return SDValue();
6157 case Intrinsic::aarch64_neon_sqrshrun:
6158 if (Op.getValueType().isVector())
6159 return DAG.getNode(
6160 ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6161 DAG.getNode(
6162 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6163 Op.getOperand(1), Op.getOperand(2)));
6164 return SDValue();
6165 case Intrinsic::aarch64_neon_uqrshrn:
6166 if (Op.getValueType().isVector())
6167 return DAG.getNode(
6168 ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6169 DAG.getNode(
6170 AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2)));
6171 return SDValue();
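// The SVE while intrinsics below differ only in signedness and in whether the
// comparison includes equality, so they share a single helper.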
6172 case Intrinsic::aarch64_sve_whilelo:
6173 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6174 /*IsEqual=*/false);
6175 case Intrinsic::aarch64_sve_whilelt:
6176 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6177 /*IsEqual=*/false);
6178 case Intrinsic::aarch64_sve_whilels:
6179 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6180 /*IsEqual=*/true);
6181 case Intrinsic::aarch64_sve_whilele:
6182 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6183 /*IsEqual=*/true);
6184 case Intrinsic::aarch64_sve_sunpkhi:
6185 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
6186 Op.getOperand(1));
6187 case Intrinsic::aarch64_sve_sunpklo:
6188 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
6189 Op.getOperand(1));
6190 case Intrinsic::aarch64_sve_uunpkhi:
6191 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
6192 Op.getOperand(1));
6193 case Intrinsic::aarch64_sve_uunpklo:
6194 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
6195 Op.getOperand(1));
6196 case Intrinsic::aarch64_sve_clasta_n:
6197 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
6198 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6199 case Intrinsic::aarch64_sve_clastb_n:
6200 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
6201 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6202 case Intrinsic::aarch64_sve_lasta:
6203 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
6204 Op.getOperand(1), Op.getOperand(2));
6205 case Intrinsic::aarch64_sve_lastb:
6206 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
6207 Op.getOperand(1), Op.getOperand(2));
6208 case Intrinsic::aarch64_sve_rev:
6209 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
6210 Op.getOperand(1));
6211 case Intrinsic::aarch64_sve_tbl:
6212 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
6213 Op.getOperand(1), Op.getOperand(2));
6214 case Intrinsic::aarch64_sve_trn1:
6215 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
6216 Op.getOperand(1), Op.getOperand(2));
6217 case Intrinsic::aarch64_sve_trn2:
6218 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
6219 Op.getOperand(1), Op.getOperand(2));
6220 case Intrinsic::aarch64_sve_uzp1:
6221 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
6222 Op.getOperand(1), Op.getOperand(2));
6223 case Intrinsic::aarch64_sve_uzp2:
6224 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
6225 Op.getOperand(1), Op.getOperand(2));
6226 case Intrinsic::aarch64_sve_zip1:
6227 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
6228 Op.getOperand(1), Op.getOperand(2));
6229 case Intrinsic::aarch64_sve_zip2:
6230 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
6231 Op.getOperand(1), Op.getOperand(2));
6232 case Intrinsic::aarch64_sve_splice:
6233 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
6234 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6235 case Intrinsic::aarch64_sve_ptrue:
6236 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
6237 case Intrinsic::aarch64_sve_clz:
6238 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
6239 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
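// The SME element-count intrinsics are all derived from RDSVL #1 (the
// streaming vector length in bytes); the halfword, word and doubleword counts
// shift the byte count right by 1, 2 and 3 respectively.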
6240 case Intrinsic::aarch64_sme_cntsb:
6241 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6242 DAG.getConstant(1, dl, MVT::i32));
6243 case Intrinsic::aarch64_sme_cntsh: {
6244 SDValue One = DAG.getConstant(1, dl, MVT::i32);
6245 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
6246 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
6247 }
6248 case Intrinsic::aarch64_sme_cntsw: {
6249 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6250 DAG.getConstant(1, dl, MVT::i32));
6251 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6252 DAG.getConstant(2, dl, MVT::i32));
6253 }
6254 case Intrinsic::aarch64_sme_cntsd: {
6255 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6256 DAG.getConstant(1, dl, MVT::i32));
6257 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6258 DAG.getConstant(3, dl, MVT::i32));
6259 }
6260 case Intrinsic::aarch64_sve_cnt: {
6261 SDValue Data = Op.getOperand(3);
6262 // CTPOP only supports integer operands.
6263 if (Data.getValueType().isFloatingPoint())
6264 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
6265 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
6266 Op.getOperand(2), Data, Op.getOperand(1));
6267 }
6268 case Intrinsic::aarch64_sve_dupq_lane:
6269 return LowerDUPQLane(Op, DAG);
6270 case Intrinsic::aarch64_sve_convert_from_svbool:
6271 if (Op.getValueType() == MVT::aarch64svcount)
6272 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
6273 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6274 case Intrinsic::aarch64_sve_convert_to_svbool:
6275 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6276 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
6277 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6278 case Intrinsic::aarch64_sve_fneg:
6279 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6280 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6281 case Intrinsic::aarch64_sve_frintp:
6282 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
6283 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6284 case Intrinsic::aarch64_sve_frintm:
6285 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
6286 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6287 case Intrinsic::aarch64_sve_frinti:
6288 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6289 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6290 case Intrinsic::aarch64_sve_frintx:
6291 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6292 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6293 case Intrinsic::aarch64_sve_frinta:
6294 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
6295 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6296 case Intrinsic::aarch64_sve_frintn:
6297 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
6298 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6299 case Intrinsic::aarch64_sve_frintz:
6300 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
6301 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6302 case Intrinsic::aarch64_sve_ucvtf:
6303 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
6304 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6305 Op.getOperand(1));
6306 case Intrinsic::aarch64_sve_scvtf:
6307 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
6308 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6309 Op.getOperand(1));
6310 case Intrinsic::aarch64_sve_fcvtzu:
6311 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
6312 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6313 Op.getOperand(1));
6314 case Intrinsic::aarch64_sve_fcvtzs:
6315 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
6316 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6317 Op.getOperand(1));
6318 case Intrinsic::aarch64_sve_fsqrt:
6319 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
6320 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6321 case Intrinsic::aarch64_sve_frecpx:
6322 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
6323 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6324 case Intrinsic::aarch64_sve_frecpe_x:
6325 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
6326 Op.getOperand(1));
6327 case Intrinsic::aarch64_sve_frecps_x:
6328 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
6329 Op.getOperand(1), Op.getOperand(2));
6330 case Intrinsic::aarch64_sve_frsqrte_x:
6331 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
6332 Op.getOperand(1));
6333 case Intrinsic::aarch64_sve_frsqrts_x:
6334 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
6335 Op.getOperand(1), Op.getOperand(2));
6336 case Intrinsic::aarch64_sve_fabs:
6337 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6338 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6339 case Intrinsic::aarch64_sve_abs:
6340 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6341 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6342 case Intrinsic::aarch64_sve_neg:
6343 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6344 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6345 case Intrinsic::aarch64_sve_insr: {
6346 SDValue Scalar = Op.getOperand(2);
6347 EVT ScalarTy = Scalar.getValueType();
6348 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6349 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
6350
6351 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
6352 Op.getOperand(1), Scalar);
6353 }
6354 case Intrinsic::aarch64_sve_rbit:
6355 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
6356 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6357 Op.getOperand(1));
6358 case Intrinsic::aarch64_sve_revb:
6359 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
6360 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6361 case Intrinsic::aarch64_sve_revh:
6362 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
6363 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6364 case Intrinsic::aarch64_sve_revw:
6365 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
6366 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6367 case Intrinsic::aarch64_sve_revd:
6368 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
6369 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6370 case Intrinsic::aarch64_sve_sxtb:
6371 return DAG.getNode(
6372 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6373 Op.getOperand(2), Op.getOperand(3),
6374 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6375 Op.getOperand(1));
6376 case Intrinsic::aarch64_sve_sxth:
6377 return DAG.getNode(
6378 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6379 Op.getOperand(2), Op.getOperand(3),
6380 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6381 Op.getOperand(1));
6382 case Intrinsic::aarch64_sve_sxtw:
6383 return DAG.getNode(
6384 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6385 Op.getOperand(2), Op.getOperand(3),
6386 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6387 Op.getOperand(1));
6388 case Intrinsic::aarch64_sve_uxtb:
6389 return DAG.getNode(
6390 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6391 Op.getOperand(2), Op.getOperand(3),
6392 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6393 Op.getOperand(1));
6394 case Intrinsic::aarch64_sve_uxth:
6395 return DAG.getNode(
6396 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6397 Op.getOperand(2), Op.getOperand(3),
6398 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6399 Op.getOperand(1));
6400 case Intrinsic::aarch64_sve_uxtw:
6401 return DAG.getNode(
6402 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6403 Op.getOperand(2), Op.getOperand(3),
6404 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6405 Op.getOperand(1));
6406 case Intrinsic::localaddress: {
6407 const auto &MF = DAG.getMachineFunction();
6408 const auto *RegInfo = Subtarget->getRegisterInfo();
6409 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6410 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
6411 Op.getSimpleValueType());
6412 }
6413
6414 case Intrinsic::eh_recoverfp: {
6415 // FIXME: This needs to be implemented to correctly handle highly aligned
6416 // stack objects. For now we simply return the incoming FP. Refer D53541
6417 // for more details.
6418 SDValue FnOp = Op.getOperand(1);
6419 SDValue IncomingFPOp = Op.getOperand(2);
6420 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6421 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6422 if (!Fn)
6423 report_fatal_error(
6424 "llvm.eh.recoverfp must take a function as the first argument");
6425 return IncomingFPOp;
6426 }
6427
6428 case Intrinsic::aarch64_neon_vsri:
6429 case Intrinsic::aarch64_neon_vsli:
6430 case Intrinsic::aarch64_sve_sri:
6431 case Intrinsic::aarch64_sve_sli: {
6432 EVT Ty = Op.getValueType();
6433
6434 if (!Ty.isVector())
6435 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6436
6437 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6438
6439 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6440 IntNo == Intrinsic::aarch64_sve_sri;
6441 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6442 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
6443 Op.getOperand(3));
6444 }
6445
6446 case Intrinsic::aarch64_neon_srhadd:
6447 case Intrinsic::aarch64_neon_urhadd:
6448 case Intrinsic::aarch64_neon_shadd:
6449 case Intrinsic::aarch64_neon_uhadd: {
6450 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6451 IntNo == Intrinsic::aarch64_neon_shadd);
6452 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6453 IntNo == Intrinsic::aarch64_neon_urhadd);
6454 unsigned Opcode = IsSignedAdd
6455 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6456 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6457 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6458 Op.getOperand(2));
6459 }
6460 case Intrinsic::aarch64_neon_saddlp:
6461 case Intrinsic::aarch64_neon_uaddlp: {
6462 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6463 ? AArch64ISD::UADDLP
6464 : AArch64ISD::SADDLP;
6465 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
6466 }
6467 case Intrinsic::aarch64_neon_sdot:
6468 case Intrinsic::aarch64_neon_udot:
6469 case Intrinsic::aarch64_sve_sdot:
6470 case Intrinsic::aarch64_sve_udot: {
6471 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6472 IntNo == Intrinsic::aarch64_sve_udot)
6473 ? AArch64ISD::UDOT
6474 : AArch64ISD::SDOT;
6475 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6476 Op.getOperand(2), Op.getOperand(3));
6477 }
6478 case Intrinsic::aarch64_neon_usdot:
6479 case Intrinsic::aarch64_sve_usdot: {
6480 return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
6481 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6482 }
6483 case Intrinsic::get_active_lane_mask: {
6484 SDValue ID =
6485 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
6486
6487 EVT VT = Op.getValueType();
6488 if (VT.isScalableVector())
6489 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6490 Op.getOperand(2));
6491
6492 // We can use the SVE whilelo instruction to lower this intrinsic by
6493 // creating the appropriate sequence of scalable vector operations and
6494 // then extracting a fixed-width subvector from the scalable vector.
6495
6496 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6497 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6498
6499 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6500 Op.getOperand(1), Op.getOperand(2));
6501 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6502 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6503 DAG.getVectorIdxConstant(0, dl));
6504 }
6505 case Intrinsic::aarch64_neon_saddlv:
6506 case Intrinsic::aarch64_neon_uaddlv: {
6507 EVT OpVT = Op.getOperand(1).getValueType();
6508 EVT ResVT = Op.getValueType();
6509 assert(
6510 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6511 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6512 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6513 "Unexpected aarch64_neon_u/saddlv type");
6514 (void)OpVT;
6515 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6516 SDValue ADDLV = DAG.getNode(
6517 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6518 : AArch64ISD::SADDLV,
6519 dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6520 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6521 ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6522 ADDLV, DAG.getConstant(0, dl, MVT::i64));
6523 return EXTRACT_VEC_ELT;
6524 }
6525 case Intrinsic::experimental_cttz_elts: {
6526 SDValue CttzOp = Op.getOperand(1);
6527 EVT VT = CttzOp.getValueType();
6528 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6529
6530 if (VT.isFixedLengthVector()) {
6531 // We can use SVE instructions to lower this intrinsic by first creating
6532 // an SVE predicate register mask from the fixed-width vector.
6533 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6534 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6535 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6536 }
6537
6538 SDValue NewCttzElts =
6539 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6540 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6541 }
6542 case Intrinsic::experimental_vector_match: {
6543 return LowerVectorMatch(Op, DAG);
6544 }
6545 }
6546}
6547
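// Gather/scatter indices with i8 or i16 elements are not supported directly,
// so request that such indices be extended to i32 first.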
6548bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6549 if (VT.getVectorElementType() == MVT::i8 ||
6550 VT.getVectorElementType() == MVT::i16) {
6551 EltTy = MVT::i32;
6552 return true;
6553 }
6554 return false;
6555}
6556
6557bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6558 EVT DataVT) const {
6559 const EVT IndexVT = Extend.getOperand(0).getValueType();
6560 // SVE only supports implicit extension of 32-bit indices.
6561 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6562 return false;
6563
6564 // Indices cannot be smaller than the main data type.
6565 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6566 return false;
6567
6568 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6569 // element container type, which would violate the previous clause.
6570 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6571}
6572
6573bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6574 EVT ExtVT = ExtVal.getValueType();
6575 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6576 return false;
6577
6578 // It may be worth creating extending masked loads if there are multiple
6579 // masked loads using the same predicate. That way we'll end up creating
6580 // extending masked loads that may then get split by the legaliser. This
6581 // results in just one set of predicate unpacks at the start, instead of
6582 // multiple sets of vector unpacks after each load.
6583 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6584 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6585 // Disable extending masked loads for fixed-width for now, since the code
6586 // quality doesn't look great.
6587 if (!ExtVT.isScalableVector())
6588 return false;
6589
6590 unsigned NumExtMaskedLoads = 0;
6591 for (auto *U : Ld->getMask()->users())
6592 if (isa<MaskedLoadSDNode>(U))
6593 NumExtMaskedLoads++;
6594
6595 if (NumExtMaskedLoads <= 1)
6596 return false;
6597 }
6598 }
6599
6600 return true;
6601}
6602
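// Map the gather addressing mode (unscaled vs. scaled offsets, and whether a
// 32-bit index needs sign- or zero-extension) onto the matching GLD1 node.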
6603unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6604 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6605 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6606 AArch64ISD::GLD1_MERGE_ZERO},
6607 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6608 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6609 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6610 AArch64ISD::GLD1_MERGE_ZERO},
6611 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6612 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6613 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6614 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6615 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6616 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6617 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6618 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6619 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6620 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6621 };
6622 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6623 return AddrModes.find(Key)->second;
6624}
6625
6626unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6627 switch (Opcode) {
6628 default:
6629 llvm_unreachable("unimplemented opcode");
6630 return Opcode;
6631 case AArch64ISD::GLD1_MERGE_ZERO:
6632 return AArch64ISD::GLD1S_MERGE_ZERO;
6633 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6634 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6635 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6636 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6637 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6638 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6639 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6640 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6641 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6642 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6643 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6644 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6645 }
6646}
6647
6648SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6649 SelectionDAG &DAG) const {
6650 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6651
6652 SDLoc DL(Op);
6653 SDValue Chain = MGT->getChain();
6654 SDValue PassThru = MGT->getPassThru();
6655 SDValue Mask = MGT->getMask();
6656 SDValue BasePtr = MGT->getBasePtr();
6657 SDValue Index = MGT->getIndex();
6658 SDValue Scale = MGT->getScale();
6659 EVT VT = Op.getValueType();
6660 EVT MemVT = MGT->getMemoryVT();
6661 ISD::LoadExtType ExtType = MGT->getExtensionType();
6662 ISD::MemIndexType IndexType = MGT->getIndexType();
6663
6664 // SVE supports zero (and so undef) passthrough values only, everything else
6665 // must be handled manually by an explicit select on the load's output.
6666 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6667 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6668 SDValue Load =
6669 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6670 MGT->getMemOperand(), IndexType, ExtType);
6671 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6672 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6673 }
6674
6675 bool IsScaled = MGT->isIndexScaled();
6676 bool IsSigned = MGT->isIndexSigned();
6677
6678 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6679 // must be calculated beforehand.
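// For example, a gather of i32 elements with a scale of 8 is rewritten as an
// unscaled gather (scale 1) with the index shifted left by 3.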
6680 uint64_t ScaleVal = Scale->getAsZExtVal();
6681 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6682 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6683 EVT IndexVT = Index.getValueType();
6684 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6685 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6686 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6687
6688 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6689 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6690 MGT->getMemOperand(), IndexType, ExtType);
6691 }
6692
6693 // Lower fixed length gather to a scalable equivalent.
6694 if (VT.isFixedLengthVector()) {
6695 assert(Subtarget->useSVEForFixedLengthVectors() &&
6696 "Cannot lower when not using SVE for fixed vectors!");
6697
6698 // NOTE: Handle floating-point as if integer then bitcast the result.
6699 EVT DataVT = VT.changeVectorElementTypeToInteger();
6700 MemVT = MemVT.changeVectorElementTypeToInteger();
6701
6702 // Find the smallest integer fixed length vector we can use for the gather.
6703 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6704 if (DataVT.getVectorElementType() == MVT::i64 ||
6705 Index.getValueType().getVectorElementType() == MVT::i64 ||
6706 Mask.getValueType().getVectorElementType() == MVT::i64)
6707 PromotedVT = VT.changeVectorElementType(MVT::i64);
6708
6709 // Promote vector operands except for passthrough, which we know is either
6710 // undef or zero, and thus best constructed directly.
6711 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6712 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6713 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6714
6715 // A promoted result type forces the need for an extending load.
6716 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6717 ExtType = ISD::EXTLOAD;
6718
6719 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6720
6721 // Convert fixed length vector operands to scalable.
6722 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6723 Index = convertToScalableVector(DAG, ContainerVT, Index);
6724 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6725 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6726 : DAG.getConstant(0, DL, ContainerVT);
6727
6728 // Emit equivalent scalable vector gather.
6729 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6730 SDValue Load =
6731 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6732 Ops, MGT->getMemOperand(), IndexType, ExtType);
6733
6734 // Extract fixed length data then convert to the required result type.
6735 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6736 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6737 if (VT.isFloatingPoint())
6738 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6739
6740 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6741 }
6742
6743 // Everything else is legal.
6744 return Op;
6745}
6746
6747SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6748 SelectionDAG &DAG) const {
6749 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6750
6751 SDLoc DL(Op);
6752 SDValue Chain = MSC->getChain();
6753 SDValue StoreVal = MSC->getValue();
6754 SDValue Mask = MSC->getMask();
6755 SDValue BasePtr = MSC->getBasePtr();
6756 SDValue Index = MSC->getIndex();
6757 SDValue Scale = MSC->getScale();
6758 EVT VT = StoreVal.getValueType();
6759 EVT MemVT = MSC->getMemoryVT();
6760 ISD::MemIndexType IndexType = MSC->getIndexType();
6761 bool Truncating = MSC->isTruncatingStore();
6762
6763 bool IsScaled = MSC->isIndexScaled();
6764 bool IsSigned = MSC->isIndexSigned();
6765
6766 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6767 // must be calculated beforehand.
6768 uint64_t ScaleVal = Scale->getAsZExtVal();
6769 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6770 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6771 EVT IndexVT = Index.getValueType();
6772 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6773 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6774 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6775
6776 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6777 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6778 MSC->getMemOperand(), IndexType, Truncating);
6779 }
6780
6781 // Lower fixed length scatter to a scalable equivalent.
6782 if (VT.isFixedLengthVector()) {
6783 assert(Subtarget->useSVEForFixedLengthVectors() &&
6784 "Cannot lower when not using SVE for fixed vectors!");
6785
6786 // Once bitcast we treat floating-point scatters as if integer.
6787 if (VT.isFloatingPoint()) {
6788 VT = VT.changeVectorElementTypeToInteger();
6789 MemVT = MemVT.changeVectorElementTypeToInteger();
6790 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6791 }
6792
6793 // Find the smallest integer fixed length vector we can use for the scatter.
6794 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6795 if (VT.getVectorElementType() == MVT::i64 ||
6796 Index.getValueType().getVectorElementType() == MVT::i64 ||
6797 Mask.getValueType().getVectorElementType() == MVT::i64)
6798 PromotedVT = VT.changeVectorElementType(MVT::i64);
6799
6800 // Promote vector operands.
6801 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6802 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6803 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6804 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6805
6806 // A promoted value type forces the need for a truncating store.
6807 if (PromotedVT != VT)
6808 Truncating = true;
6809
6810 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6811
6812 // Convert fixed length vector operands to scalable.
6813 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6814 Index = convertToScalableVector(DAG, ContainerVT, Index);
6815 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6816 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6817
6818 // Emit equivalent scalable vector scatter.
6819 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6820 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6821 MSC->getMemOperand(), IndexType, Truncating);
6822 }
6823
6824 // Everything else is legal.
6825 return Op;
6826}
6827
6828SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6829 SDLoc DL(Op);
6830 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6831 assert(LoadNode && "Expected custom lowering of a masked load node");
6832 EVT VT = Op->getValueType(0);
6833
6834 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6835 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6836
6837 SDValue PassThru = LoadNode->getPassThru();
6838 SDValue Mask = LoadNode->getMask();
6839
6840 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6841 return Op;
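// As with gathers, SVE masked loads only support zero/undef passthrough
// values, so emit the load with an undef passthrough and merge in the real
// passthrough with an explicit select afterwards.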
6842
6843 SDValue Load = DAG.getMaskedLoad(
6844 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6845 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6846 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6847 LoadNode->getExtensionType());
6848
6849 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6850
6851 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6852}
6853
6854// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6855 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6856 EVT VT, EVT MemVT,
6857 SelectionDAG &DAG) {
6858 assert(VT.isVector() && "VT should be a vector type");
6859 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6860
6861 SDValue Value = ST->getValue();
6862
6863 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6864 // extracts the word lane which represents the v4i8 subvector. It optimizes
6865 // the store to:
6866 //
6867 // xtn v0.8b, v0.8h
6868 // str s0, [x0]
6869
6870 SDValue Undef = DAG.getUNDEF(MVT::i16);
6871 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6872 {Undef, Undef, Undef, Undef});
6873
6874 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6875 Value, UndefVec);
6876 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6877
6878 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6879 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6880 Trunc, DAG.getConstant(0, DL, MVT::i64));
6881
6882 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6883 ST->getBasePtr(), ST->getMemOperand());
6884}
6885
6886 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
6887 SDLoc dl(Op);
6888 SDValue Src = Op.getOperand(0);
6889 MVT DestVT = Op.getSimpleValueType();
6890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6891 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
6892
6893 unsigned SrcAS = N->getSrcAddressSpace();
6894 unsigned DestAS = N->getDestAddressSpace();
6895 assert(SrcAS != DestAS &&
6896 "addrspacecast must be between different address spaces");
6897 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
6898 TLI.getTargetMachine().getPointerSize(DestAS) &&
6899 "addrspacecast must be between different ptr sizes");
6900 (void)TLI;
6901
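// PTR32_SPTR/PTR32_UPTR are 32-bit address spaces: casts out of them sign- or
// zero-extend to the full pointer width, while casts into them keep only the
// low 32 bits (zero-extended in the 64-bit register).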
6902 if (SrcAS == ARM64AS::PTR32_SPTR) {
6903 return DAG.getNode(ISD::SIGN_EXTEND, dl, DestVT, Src,
6904 DAG.getTargetConstant(0, dl, DestVT));
6905 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
6906 return DAG.getNode(ISD::ZERO_EXTEND, dl, DestVT, Src,
6907 DAG.getTargetConstant(0, dl, DestVT));
6908 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
6909 (DestAS == ARM64AS::PTR32_UPTR)) {
6910 SDValue Ext = DAG.getAnyExtOrTrunc(Src, dl, DestVT);
6911 SDValue Trunc = DAG.getZeroExtendInReg(Ext, dl, DestVT);
6912 return Trunc;
6913 } else {
6914 return Src;
6915 }
6916}
6917
6918 // Custom lowering for any store, vector or scalar, with or without a
6919 // truncate operation. Currently we only custom lower the truncating store
6920 // from vector v4i16 to v4i8 and volatile stores of i128.
6921SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6922 SelectionDAG &DAG) const {
6923 SDLoc Dl(Op);
6924 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6925 assert (StoreNode && "Can only custom lower store nodes");
6926
6927 SDValue Value = StoreNode->getValue();
6928
6929 EVT VT = Value.getValueType();
6930 EVT MemVT = StoreNode->getMemoryVT();
6931
6932 if (VT.isVector()) {
6933 if (useSVEForFixedLengthVectorVT(
6934 VT,
6935 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6936 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6937
6938 unsigned AS = StoreNode->getAddressSpace();
6939 Align Alignment = StoreNode->getAlign();
6940 if (Alignment < MemVT.getStoreSize() &&
6941 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6942 StoreNode->getMemOperand()->getFlags(),
6943 nullptr)) {
6944 return scalarizeVectorStore(StoreNode, DAG);
6945 }
6946
6947 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6948 MemVT == MVT::v4i8) {
6949 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6950 }
6951 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6952 // the custom lowering, as there are no un-paired non-temporal stores and
6953 // legalization will break up 256 bit inputs.
6954 ElementCount EC = MemVT.getVectorElementCount();
6955 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6956 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6957 (MemVT.getScalarSizeInBits() == 8u ||
6958 MemVT.getScalarSizeInBits() == 16u ||
6959 MemVT.getScalarSizeInBits() == 32u ||
6960 MemVT.getScalarSizeInBits() == 64u)) {
6961 SDValue Lo =
6962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6963 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6964 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6965 SDValue Hi =
6966 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6967 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6968 StoreNode->getValue(),
6969 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6970 SDValue Result = DAG.getMemIntrinsicNode(
6971 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6972 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6973 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6974 return Result;
6975 }
6976 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6977 return LowerStore128(Op, DAG);
6978 } else if (MemVT == MVT::i64x8) {
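// An i64x8 (LS64) store is split into eight i64 stores, each 8 bytes apart.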
6979 SDValue Value = StoreNode->getValue();
6980 assert(Value->getValueType(0) == MVT::i64x8);
6981 SDValue Chain = StoreNode->getChain();
6982 SDValue Base = StoreNode->getBasePtr();
6983 EVT PtrVT = Base.getValueType();
6984 for (unsigned i = 0; i < 8; i++) {
6985 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6986 Value, DAG.getConstant(i, Dl, MVT::i32));
6987 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6988 DAG.getConstant(i * 8, Dl, PtrVT));
6989 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6990 StoreNode->getOriginalAlign());
6991 }
6992 return Chain;
6993 }
6994
6995 return SDValue();
6996}
6997
6998/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6999SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7000 SelectionDAG &DAG) const {
7001 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7002 assert(StoreNode->getMemoryVT() == MVT::i128);
7003 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7004
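// The 128-bit value is split into two 64-bit halves and stored with a single
// STP, or STILP when release semantics are required.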
7005 bool IsStoreRelease =
7006 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7007 if (StoreNode->isAtomic())
7008 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7009 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7010 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7011 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7012
7013 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7014 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7015 ? StoreNode->getOperand(1)
7016 : StoreNode->getOperand(2);
7017 SDLoc DL(Op);
7018 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7019 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7020 if (DAG.getDataLayout().isBigEndian())
7021 std::swap(StoreValue.first, StoreValue.second);
7022 SDValue Result = DAG.getMemIntrinsicNode(
7023 Opcode, DL, DAG.getVTList(MVT::Other),
7024 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7025 StoreNode->getBasePtr()},
7026 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7027 return Result;
7028}
7029
7030SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7031 SelectionDAG &DAG) const {
7032 SDLoc DL(Op);
7033 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7034 assert(LoadNode && "Expected custom lowering of a load node");
7035
7036 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7037 SmallVector<SDValue, 8> Ops;
7038 SDValue Base = LoadNode->getBasePtr();
7039 SDValue Chain = LoadNode->getChain();
7040 EVT PtrVT = Base.getValueType();
7041 for (unsigned i = 0; i < 8; i++) {
7042 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7043 DAG.getConstant(i * 8, DL, PtrVT));
7044 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
7045 LoadNode->getPointerInfo(),
7046 LoadNode->getOriginalAlign());
7047 Ops.push_back(Part);
7048 Chain = SDValue(Part.getNode(), 1);
7049 }
7050 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7051 return DAG.getMergeValues({Loaded, Chain}, DL);
7052 }
7053
7054 // Custom lowering for extending v4i8 vector loads.
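// The 4 x i8 payload is loaded as a single 32-bit scalar into an FP register
// and then widened with vector extends; for the zero-extending case this is
// roughly:
//   ldr s0, [x0]
//   ushll v0.8h, v0.8b, #0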
7055 EVT VT = Op->getValueType(0);
7056 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7057
7058 if (LoadNode->getMemoryVT() != MVT::v4i8)
7059 return SDValue();
7060
7061 // Avoid generating unaligned loads.
7062 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7063 return SDValue();
7064
7065 unsigned ExtType;
7066 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7067 ExtType = ISD::SIGN_EXTEND;
7068 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7069 LoadNode->getExtensionType() == ISD::EXTLOAD)
7070 ExtType = ISD::ZERO_EXTEND;
7071 else
7072 return SDValue();
7073
7074 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7075 LoadNode->getBasePtr(), MachinePointerInfo());
7076 SDValue Chain = Load.getValue(1);
7077 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7078 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7079 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7080 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7081 DAG.getConstant(0, DL, MVT::i64));
7082 if (VT == MVT::v4i32)
7083 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7084 return DAG.getMergeValues({Ext, Chain}, DL);
7085}
7086
7087SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7088 SelectionDAG &DAG) const {
7089 SDLoc DL(Op);
7090 SDValue Vec = Op.getOperand(0);
7091 SDValue Mask = Op.getOperand(1);
7092 SDValue Passthru = Op.getOperand(2);
7093 EVT VecVT = Vec.getValueType();
7094 EVT MaskVT = Mask.getValueType();
7095 EVT ElmtVT = VecVT.getVectorElementType();
7096 const bool IsFixedLength = VecVT.isFixedLengthVector();
7097 const bool HasPassthru = !Passthru.isUndef();
7098 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7099 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7100
7101 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7102
7103 if (!Subtarget->isSVEAvailable())
7104 return SDValue();
7105
7106 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7107 return SDValue();
7108
7109 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7110 if (MinElmts != 2 && MinElmts != 4)
7111 return SDValue();
7112
7113 // We can use the SVE register containing the NEON vector in its lowest bits.
7114 if (IsFixedLength) {
7115 EVT ScalableVecVT =
7116 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7117 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7118 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7119
7120 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7121 DAG.getUNDEF(ScalableVecVT), Vec,
7122 DAG.getConstant(0, DL, MVT::i64));
7123 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7124 DAG.getUNDEF(ScalableMaskVT), Mask,
7125 DAG.getConstant(0, DL, MVT::i64));
7126 Mask = DAG.getNode(ISD::TRUNCATE, DL,
7127 ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7128 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7129 DAG.getUNDEF(ScalableVecVT), Passthru,
7130 DAG.getConstant(0, DL, MVT::i64));
7131
7132 VecVT = Vec.getValueType();
7133 MaskVT = Mask.getValueType();
7134 }
7135
7136 // Get legal type for compact instruction
7137 EVT ContainerVT = getSVEContainerType(VecVT);
7138 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7139
7140 // Convert to i32 or i64 for smaller types, as these are the only supported
7141 // sizes for compact.
7142 if (ContainerVT != VecVT) {
7143 Vec = DAG.getBitcast(CastVT, Vec);
7144 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7145 }
7146
7147 SDValue Compressed = DAG.getNode(
7148 ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
7149 DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7150
7151 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
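// Otherwise the first cntp(Mask) lanes of the result come from the compacted
// vector and the remaining lanes are taken from the passthru, selected via a
// whilelo-generated predicate.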
7152 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7153 SDValue Offset = DAG.getNode(
7154 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7155 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7156
7157 SDValue IndexMask = DAG.getNode(
7158 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7159 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7160 DAG.getConstant(0, DL, MVT::i64), Offset);
7161
7162 Compressed =
7163 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7164 }
7165
7166 // Extracting from a legal SVE type before truncating produces better code.
7167 if (IsFixedLength) {
7168 Compressed = DAG.getNode(
7169 ISD::EXTRACT_SUBVECTOR, DL,
7170 FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7171 Compressed, DAG.getConstant(0, DL, MVT::i64));
7172 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7173 VecVT = FixedVecVT;
7174 }
7175
7176 // If we changed the element type before, we need to convert it back.
7177 if (ContainerVT != VecVT) {
7178 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7179 Compressed = DAG.getBitcast(VecVT, Compressed);
7180 }
7181
7182 return Compressed;
7183}
7184
7185// Generate SUBS and CSEL for integer abs.
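// i.e. Neg = 0 - X; SUBS(X, 0) sets the flags; CSEL then picks X when the
// result is positive or zero (PL) and Neg otherwise.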
7186SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7187 MVT VT = Op.getSimpleValueType();
7188
7189 if (VT.isVector())
7190 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7191
7192 SDLoc DL(Op);
7193 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7194 Op.getOperand(0));
7195 // Generate SUBS & CSEL.
7196 SDValue Cmp =
7197 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
7198 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7199 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7200 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
7201 Cmp.getValue(1));
7202}
7203
7204 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7205 SDValue Chain = Op.getOperand(0);
7206 SDValue Cond = Op.getOperand(1);
7207 SDValue Dest = Op.getOperand(2);
7208
7209 AArch64CC::CondCode CC;
7210 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7211 SDLoc dl(Op);
7212 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
7213 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
7214 Cmp);
7215 }
7216
7217 return SDValue();
7218}
7219
7220 // Treat FSHR with constant shifts as a legal operation; otherwise it is expanded.
7221 // FSHL is converted to FSHR before deciding what to do with it.
7222 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7223 SDValue Shifts = Op.getOperand(2);
7224 // Check if the shift amount is a constant
7225 // If opcode is FSHL, convert it to FSHR
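// e.g. on i32, fshl(a, b, 3) becomes fshr(a, b, 32 - 3).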
7226 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7227 SDLoc DL(Op);
7228 MVT VT = Op.getSimpleValueType();
7229
7230 if (Op.getOpcode() == ISD::FSHL) {
7231 unsigned int NewShiftNo =
7232 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
7233 return DAG.getNode(
7234 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7235 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7236 } else if (Op.getOpcode() == ISD::FSHR) {
7237 return Op;
7238 }
7239 }
7240
7241 return SDValue();
7242}
7243
7244 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7245 SDValue X = Op.getOperand(0);
7246 EVT XScalarTy = X.getValueType();
7247 SDValue Exp = Op.getOperand(1);
7248
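// Lower by inserting the scalar operands into SVE vectors and using the
// predicated FSCALE instruction (x * 2^exp), then extracting lane 0.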
7249 SDLoc DL(Op);
7250 EVT XVT, ExpVT;
7251 switch (Op.getSimpleValueType().SimpleTy) {
7252 default:
7253 return SDValue();
7254 case MVT::bf16:
7255 case MVT::f16:
7256 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7257 [[fallthrough]];
7258 case MVT::f32:
7259 XVT = MVT::nxv4f32;
7260 ExpVT = MVT::nxv4i32;
7261 break;
7262 case MVT::f64:
7263 XVT = MVT::nxv2f64;
7264 ExpVT = MVT::nxv2i64;
7265 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7266 break;
7267 }
7268
7269 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7270 SDValue VX =
7271 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7272 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7273 DAG.getUNDEF(ExpVT), Exp, Zero);
7274 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7275 AArch64SVEPredPattern::all);
7276 SDValue FScale =
7277 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7278 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7279 VPg, VX, VExp);
7280 SDValue Final =
7281 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7282 if (X.getValueType() != XScalarTy)
7283 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7284 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7285 return Final;
7286}
7287
7288SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7289 SelectionDAG &DAG) const {
7290 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7291 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7292 report_fatal_error(
7293 "ADJUST_TRAMPOLINE operation is only supported on Linux.");
7294
7295 return Op.getOperand(0);
7296}
7297
7298SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7299 SelectionDAG &DAG) const {
7300
7301 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7302 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7303 report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
7304
7305 SDValue Chain = Op.getOperand(0);
7306 SDValue Trmp = Op.getOperand(1); // trampoline
7307 SDValue FPtr = Op.getOperand(2); // nested function
7308 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7309 SDLoc dl(Op);
7310
7311 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7312 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
7313
7314 TargetLowering::ArgListTy Args;
7315 TargetLowering::ArgListEntry Entry;
7316
7317 Entry.Ty = IntPtrTy;
7318 Entry.Node = Trmp;
7319 Args.push_back(Entry);
7320
7321 if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
7322 MachineFunction &MF = DAG.getMachineFunction();
7323 MachineFrameInfo &MFI = MF.getFrameInfo();
7324 Entry.Node =
7325 DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
7326 } else
7327 Entry.Node = DAG.getConstant(36, dl, MVT::i64);
7328
7329 Args.push_back(Entry);
7330 Entry.Node = FPtr;
7331 Args.push_back(Entry);
7332 Entry.Node = Nest;
7333 Args.push_back(Entry);
7334
7335 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
7336 TargetLowering::CallLoweringInfo CLI(DAG);
7337 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
7338 CallingConv::C, Type::getVoidTy(*DAG.getContext()),
7339 DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
7340
7341 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7342 return CallResult.second;
7343}
7344
7345 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7346 SelectionDAG &DAG) const {
7347 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7348 LLVM_DEBUG(Op.dump());
7349
7350 switch (Op.getOpcode()) {
7351 default:
7352 llvm_unreachable("unimplemented operand");
7353 return SDValue();
7354 case ISD::BITCAST:
7355 return LowerBITCAST(Op, DAG);
7356 case ISD::GlobalAddress:
7357 return LowerGlobalAddress(Op, DAG);
7358 case ISD::GlobalTLSAddress:
7359 return LowerGlobalTLSAddress(Op, DAG);
7360 case ISD::PtrAuthGlobalAddress:
7361 return LowerPtrAuthGlobalAddress(Op, DAG);
7362 case ISD::ADJUST_TRAMPOLINE:
7363 return LowerADJUST_TRAMPOLINE(Op, DAG);
7364 case ISD::INIT_TRAMPOLINE:
7365 return LowerINIT_TRAMPOLINE(Op, DAG);
7366 case ISD::SETCC:
7367 case ISD::STRICT_FSETCC:
7368 case ISD::STRICT_FSETCCS:
7369 return LowerSETCC(Op, DAG);
7370 case ISD::SETCCCARRY:
7371 return LowerSETCCCARRY(Op, DAG);
7372 case ISD::BRCOND:
7373 return LowerBRCOND(Op, DAG);
7374 case ISD::BR_CC:
7375 return LowerBR_CC(Op, DAG);
7376 case ISD::SELECT:
7377 return LowerSELECT(Op, DAG);
7378 case ISD::SELECT_CC:
7379 return LowerSELECT_CC(Op, DAG);
7380 case ISD::JumpTable:
7381 return LowerJumpTable(Op, DAG);
7382 case ISD::BR_JT:
7383 return LowerBR_JT(Op, DAG);
7384 case ISD::BRIND:
7385 return LowerBRIND(Op, DAG);
7386 case ISD::ConstantPool:
7387 return LowerConstantPool(Op, DAG);
7388 case ISD::BlockAddress:
7389 return LowerBlockAddress(Op, DAG);
7390 case ISD::VASTART:
7391 return LowerVASTART(Op, DAG);
7392 case ISD::VACOPY:
7393 return LowerVACOPY(Op, DAG);
7394 case ISD::VAARG:
7395 return LowerVAARG(Op, DAG);
7396 case ISD::UADDO_CARRY:
7397 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7398 case ISD::USUBO_CARRY:
7399 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7400 case ISD::SADDO_CARRY:
7401 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7402 case ISD::SSUBO_CARRY:
7403 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7404 case ISD::SADDO:
7405 case ISD::UADDO:
7406 case ISD::SSUBO:
7407 case ISD::USUBO:
7408 case ISD::SMULO:
7409 case ISD::UMULO:
7410 return LowerXALUO(Op, DAG);
7411 case ISD::FADD:
7412 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7413 case ISD::FSUB:
7414 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7415 case ISD::FMUL:
7416 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7417 case ISD::FMA:
7418 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7419 case ISD::FDIV:
7420 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7421 case ISD::FNEG:
7422 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7423 case ISD::FCEIL:
7424 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7425 case ISD::FFLOOR:
7426 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7427 case ISD::FNEARBYINT:
7428 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7429 case ISD::FRINT:
7430 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7431 case ISD::FROUND:
7432 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7433 case ISD::FROUNDEVEN:
7434 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7435 case ISD::FTRUNC:
7436 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7437 case ISD::FSQRT:
7438 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7439 case ISD::FABS:
7440 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7441 case ISD::FP_ROUND:
7442 case ISD::STRICT_FP_ROUND:
7443 return LowerFP_ROUND(Op, DAG);
7444 case ISD::FP_EXTEND:
7445 case ISD::STRICT_FP_EXTEND:
7446 return LowerFP_EXTEND(Op, DAG);
7447 case ISD::FRAMEADDR:
7448 return LowerFRAMEADDR(Op, DAG);
7449 case ISD::SPONENTRY:
7450 return LowerSPONENTRY(Op, DAG);
7451 case ISD::RETURNADDR:
7452 return LowerRETURNADDR(Op, DAG);
7453 case ISD::ADDROFRETURNADDR:
7454 return LowerADDROFRETURNADDR(Op, DAG);
7455 case ISD::CONCAT_VECTORS:
7456 return LowerCONCAT_VECTORS(Op, DAG);
7457 case ISD::INSERT_VECTOR_ELT:
7458 return LowerINSERT_VECTOR_ELT(Op, DAG);
7459 case ISD::EXTRACT_VECTOR_ELT:
7460 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7461 case ISD::BUILD_VECTOR:
7462 return LowerBUILD_VECTOR(Op, DAG);
7463 case ISD::ZERO_EXTEND_VECTOR_INREG:
7464 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7465 case ISD::VECTOR_SHUFFLE:
7466 return LowerVECTOR_SHUFFLE(Op, DAG);
7467 case ISD::SPLAT_VECTOR:
7468 return LowerSPLAT_VECTOR(Op, DAG);
7469 case ISD::EXTRACT_SUBVECTOR:
7470 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7471 case ISD::INSERT_SUBVECTOR:
7472 return LowerINSERT_SUBVECTOR(Op, DAG);
7473 case ISD::SDIV:
7474 case ISD::UDIV:
7475 return LowerDIV(Op, DAG);
7476 case ISD::SMIN:
7477 case ISD::UMIN:
7478 case ISD::SMAX:
7479 case ISD::UMAX:
7480 return LowerMinMax(Op, DAG);
7481 case ISD::SRA:
7482 case ISD::SRL:
7483 case ISD::SHL:
7484 return LowerVectorSRA_SRL_SHL(Op, DAG);
7485 case ISD::SHL_PARTS:
7486 case ISD::SRL_PARTS:
7487 case ISD::SRA_PARTS:
7488 return LowerShiftParts(Op, DAG);
7489 case ISD::CTPOP:
7490 case ISD::PARITY:
7491 return LowerCTPOP_PARITY(Op, DAG);
7492 case ISD::FCOPYSIGN:
7493 return LowerFCOPYSIGN(Op, DAG);
7494 case ISD::OR:
7495 return LowerVectorOR(Op, DAG);
7496 case ISD::XOR:
7497 return LowerXOR(Op, DAG);
7498 case ISD::PREFETCH:
7499 return LowerPREFETCH(Op, DAG);
7500 case ISD::SINT_TO_FP:
7501 case ISD::UINT_TO_FP:
7502 case ISD::STRICT_SINT_TO_FP:
7503 case ISD::STRICT_UINT_TO_FP:
7504 return LowerINT_TO_FP(Op, DAG);
7505 case ISD::FP_TO_SINT:
7506 case ISD::FP_TO_UINT:
7507 case ISD::STRICT_FP_TO_SINT:
7508 case ISD::STRICT_FP_TO_UINT:
7509 return LowerFP_TO_INT(Op, DAG);
7510 case ISD::FP_TO_SINT_SAT:
7511 case ISD::FP_TO_UINT_SAT:
7512 return LowerFP_TO_INT_SAT(Op, DAG);
7513 case ISD::FSINCOS:
7514 return LowerFSINCOS(Op, DAG);
7515 case ISD::GET_ROUNDING:
7516 return LowerGET_ROUNDING(Op, DAG);
7517 case ISD::SET_ROUNDING:
7518 return LowerSET_ROUNDING(Op, DAG);
7519 case ISD::GET_FPMODE:
7520 return LowerGET_FPMODE(Op, DAG);
7521 case ISD::SET_FPMODE:
7522 return LowerSET_FPMODE(Op, DAG);
7523 case ISD::RESET_FPMODE:
7524 return LowerRESET_FPMODE(Op, DAG);
7525 case ISD::MUL:
7526 return LowerMUL(Op, DAG);
7527 case ISD::MULHS:
7528 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7529 case ISD::MULHU:
7530 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7531 case ISD::INTRINSIC_W_CHAIN:
7532 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7533 case ISD::INTRINSIC_WO_CHAIN:
7534 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7535 case ISD::INTRINSIC_VOID:
7536 return LowerINTRINSIC_VOID(Op, DAG);
7537 case ISD::ATOMIC_STORE:
7538 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7539 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7540 return LowerStore128(Op, DAG);
7541 }
7542 return SDValue();
7543 case ISD::STORE:
7544 return LowerSTORE(Op, DAG);
7545 case ISD::MSTORE:
7546 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7547 case ISD::MGATHER:
7548 return LowerMGATHER(Op, DAG);
7549 case ISD::MSCATTER:
7550 return LowerMSCATTER(Op, DAG);
7551 case ISD::VECREDUCE_SEQ_FADD:
7552 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7553 case ISD::VECREDUCE_ADD:
7554 case ISD::VECREDUCE_AND:
7555 case ISD::VECREDUCE_OR:
7556 case ISD::VECREDUCE_XOR:
7557 case ISD::VECREDUCE_SMAX:
7558 case ISD::VECREDUCE_SMIN:
7559 case ISD::VECREDUCE_UMAX:
7560 case ISD::VECREDUCE_UMIN:
7561 case ISD::VECREDUCE_FADD:
7562 case ISD::VECREDUCE_FMAX:
7563 case ISD::VECREDUCE_FMIN:
7564 case ISD::VECREDUCE_FMAXIMUM:
7565 case ISD::VECREDUCE_FMINIMUM:
7566 return LowerVECREDUCE(Op, DAG);
7567 case ISD::ATOMIC_LOAD_AND:
7568 return LowerATOMIC_LOAD_AND(Op, DAG);
7569 case ISD::DYNAMIC_STACKALLOC:
7570 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7571 case ISD::VSCALE:
7572 return LowerVSCALE(Op, DAG);
7573 case ISD::VECTOR_COMPRESS:
7574 return LowerVECTOR_COMPRESS(Op, DAG);
7575 case ISD::ANY_EXTEND:
7576 case ISD::SIGN_EXTEND:
7577 case ISD::ZERO_EXTEND:
7578 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7579 case ISD::ADDRSPACECAST:
7580 return LowerADDRSPACECAST(Op, DAG);
7581 case ISD::SIGN_EXTEND_INREG: {
7582 // Only custom lower when ExtraVT has a legal byte based element type.
7583 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7584 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7585 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7586 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7587 return SDValue();
7588
7589 return LowerToPredicatedOp(Op, DAG,
7590 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7591 }
7592 case ISD::TRUNCATE:
7593 return LowerTRUNCATE(Op, DAG);
7594 case ISD::MLOAD:
7595 return LowerMLOAD(Op, DAG);
7596 case ISD::LOAD:
7597 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7598 !Subtarget->isNeonAvailable()))
7599 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7600 return LowerLOAD(Op, DAG);
7601 case ISD::ADD:
7602 case ISD::AND:
7603 case ISD::SUB:
7604 return LowerToScalableOp(Op, DAG);
7605 case ISD::FMAXIMUM:
7606 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7607 case ISD::FMAXNUM:
7608 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7609 case ISD::FMINIMUM:
7610 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7611 case ISD::FMINNUM:
7612 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7613 case ISD::VSELECT:
7614 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7615 case ISD::ABS:
7616 return LowerABS(Op, DAG);
7617 case ISD::ABDS:
7618 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7619 case ISD::ABDU:
7620 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7621 case ISD::AVGFLOORS:
7622 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7623 case ISD::AVGFLOORU:
7624 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7625 case ISD::AVGCEILS:
7626 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7627 case ISD::AVGCEILU:
7628 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7629 case ISD::BITREVERSE:
7630 return LowerBitreverse(Op, DAG);
7631 case ISD::BSWAP:
7632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7633 case ISD::CTLZ:
7634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7635 case ISD::CTTZ:
7636 return LowerCTTZ(Op, DAG);
7637 case ISD::VECTOR_SPLICE:
7638 return LowerVECTOR_SPLICE(Op, DAG);
7639 case ISD::VECTOR_DEINTERLEAVE:
7640 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7641 case ISD::VECTOR_INTERLEAVE:
7642 return LowerVECTOR_INTERLEAVE(Op, DAG);
7643 case ISD::LRINT:
7644 case ISD::LLRINT:
7645 if (Op.getValueType().isVector())
7646 return LowerVectorXRINT(Op, DAG);
7647 [[fallthrough]];
7648 case ISD::LROUND:
7649 case ISD::LLROUND: {
7650 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7651 Op.getOperand(0).getValueType() == MVT::bf16) &&
7652 "Expected custom lowering of rounding operations only for f16");
7653 SDLoc DL(Op);
7654 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7655 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7656 }
7657 case ISD::STRICT_LROUND:
7658 case ISD::STRICT_LLROUND:
7659 case ISD::STRICT_LRINT:
7660 case ISD::STRICT_LLRINT: {
7661 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7662 Op.getOperand(1).getValueType() == MVT::bf16) &&
7663 "Expected custom lowering of rounding operations only for f16");
7664 SDLoc DL(Op);
7665 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7666 {Op.getOperand(0), Op.getOperand(1)});
7667 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7668 {Ext.getValue(1), Ext.getValue(0)});
7669 }
7670 case ISD::WRITE_REGISTER: {
7671 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7672 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7673 SDLoc DL(Op);
7674
7675 SDValue Chain = Op.getOperand(0);
7676 SDValue SysRegName = Op.getOperand(1);
7677 std::pair<SDValue, SDValue> Pair =
7678 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7679
7680 // chain = MSRR(chain, sysregname, lo, hi)
7681 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7682 SysRegName, Pair.first, Pair.second);
7683
7684 return Result;
7685 }
7686 case ISD::FSHL:
7687 case ISD::FSHR:
7688 return LowerFunnelShift(Op, DAG);
7689 case ISD::FLDEXP:
7690 return LowerFLDEXP(Op, DAG);
7691 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7692 return LowerVECTOR_HISTOGRAM(Op, DAG);
7693 }
7694}
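// The switch above is a pure dispatch table: every opcode marked Custom in the
// constructor ends up here and is routed either to a dedicated Lower* helper
// or, for SVE/streaming targets, to LowerToPredicatedOp/LowerToScalableOp. For
// example, an FMAXNUM that must be handled with SVE is rewritten to the
// predicated AArch64ISD::FMAXNM_PRED node rather than being expanded.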
7695
7696bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7697 return !Subtarget->useSVEForFixedLengthVectors();
7698}
7699
7700bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7701 EVT VT, bool OverrideNEON) const {
7702 if (!VT.isFixedLengthVector() || !VT.isSimple())
7703 return false;
7704
7705 // Don't use SVE for vectors we cannot scalarize if required.
7706 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7707 // Fixed length predicates should be promoted to i8.
7708 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7709 case MVT::i1:
7710 default:
7711 return false;
7712 case MVT::i8:
7713 case MVT::i16:
7714 case MVT::i32:
7715 case MVT::i64:
7716 case MVT::f16:
7717 case MVT::f32:
7718 case MVT::f64:
7719 break;
7720 }
7721
7722 // NEON-sized vectors can be emulated using SVE instructions.
7723 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7724 return Subtarget->isSVEorStreamingSVEAvailable();
7725
7726 // Ensure NEON MVTs only belong to a single register class.
7727 if (VT.getFixedSizeInBits() <= 128)
7728 return false;
7729
7730 // Ensure wider than NEON code generation is enabled.
7731 if (!Subtarget->useSVEForFixedLengthVectors())
7732 return false;
7733
7734 // Don't use SVE for types that don't fit.
7735 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7736 return false;
7737
7738 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7739 // the base fixed length SVE support in place.
7740 if (!VT.isPow2VectorType())
7741 return false;
7742
7743 return true;
7744}
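// Worked example of the checks above, assuming the minimum SVE vector length
// is configured as 256 bits:
// * v8i32 (256 bits) is a simple, power-of-2 fixed-length vector wider than
//   NEON and within the minimum SVE width, so the function returns true.
// * v4i32 (128 bits) fits in a NEON register and returns false unless
//   OverrideNEON is set and SVE (or streaming SVE) is available.
// * v3i32 fails the isPow2VectorType() check and returns false.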
7745
7746//===----------------------------------------------------------------------===//
7747// Calling Convention Implementation
7748//===----------------------------------------------------------------------===//
7749
7750static unsigned getIntrinsicID(const SDNode *N) {
7751 unsigned Opcode = N->getOpcode();
7752 switch (Opcode) {
7753 default:
7754 return Intrinsic::not_intrinsic;
7755 case ISD::INTRINSIC_WO_CHAIN: {
7756 unsigned IID = N->getConstantOperandVal(0);
7757 if (IID < Intrinsic::num_intrinsics)
7758 return IID;
7759 return Intrinsic::not_intrinsic;
7760 }
7761 }
7762}
7763
7764bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7765 SDValue N1) const {
7766 if (!N0.hasOneUse())
7767 return false;
7768
7769 unsigned IID = getIntrinsicID(N1.getNode());
7770 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7771 if (IID == Intrinsic::aarch64_neon_umull ||
7772 N1.getOpcode() == AArch64ISD::UMULL ||
7773 IID == Intrinsic::aarch64_neon_smull ||
7774 N1.getOpcode() == AArch64ISD::SMULL)
7775 return N0.getOpcode() != ISD::ADD;
7776
7777 return true;
7778}
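// Rationale: when N1 is an (s|u)mull (either the intrinsic form or the
// AArch64ISD node), keeping the surrounding add in its original
// add(x, mul(a, b)) shape lets instruction selection fold the pair into
// smlal/umlal. Reassociating through the add would break that pattern, so it
// is only treated as profitable when N0 is not itself an ADD.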
7779
7780/// Selects the correct CCAssignFn for a given CallingConvention value.
7781CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7782 bool IsVarArg) const {
7783 switch (CC) {
7784 default:
7785 report_fatal_error("Unsupported calling convention.");
7786 case CallingConv::GHC:
7787 return CC_AArch64_GHC;
7788 case CallingConv::PreserveNone:
7789 // The VarArg implementation makes assumptions about register
7790 // argument passing that do not hold for preserve_none, so we
7791 // instead fall back to C argument passing.
7792 // The non-vararg case is handled in the CC function itself.
7793 if (!IsVarArg)
7794 return CC_AArch64_Preserve_None;
7795 [[fallthrough]];
7796 case CallingConv::C:
7797 case CallingConv::Fast:
7798 case CallingConv::PreserveMost:
7799 case CallingConv::PreserveAll:
7800 case CallingConv::CXX_FAST_TLS:
7801 case CallingConv::Swift:
7802 case CallingConv::SwiftTail:
7803 case CallingConv::Tail:
7804 case CallingConv::GRAAL:
7805 if (Subtarget->isTargetWindows()) {
7806 if (IsVarArg) {
7807 if (Subtarget->isWindowsArm64EC())
7808 return CC_AArch64_Arm64EC_VarArg;
7809 return CC_AArch64_Win64_VarArg;
7810 }
7811 return CC_AArch64_Win64PCS;
7812 }
7813 if (!Subtarget->isTargetDarwin())
7814 return CC_AArch64_AAPCS;
7815 if (!IsVarArg)
7816 return CC_AArch64_DarwinPCS;
7817 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7818 : CC_AArch64_DarwinPCS_VarArg;
7819 case CallingConv::Win64:
7820 if (IsVarArg) {
7821 if (Subtarget->isWindowsArm64EC())
7822 return CC_AArch64_Arm64EC_VarArg;
7823 return CC_AArch64_Win64_VarArg;
7824 }
7825 return CC_AArch64_Win64PCS;
7826 case CallingConv::CFGuard_Check:
7827 if (Subtarget->isWindowsArm64EC())
7828 return CC_AArch64_Arm64EC_CFGuard_Check;
7829 return CC_AArch64_Win64_CFGuard_Check;
7830 case CallingConv::AArch64_VectorCall:
7831 case CallingConv::AArch64_SVE_VectorCall:
7832 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
7833 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
7834 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
7835 return CC_AArch64_AAPCS;
7836 case CallingConv::ARM64EC_Thunk_X64:
7837 return CC_AArch64_Arm64EC_Thunk;
7838 case CallingConv::ARM64EC_Thunk_Native:
7839 return CC_AArch64_Arm64EC_Thunk_Native;
7840 }
7841}
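// Rough summary of the mapping above: GHC and preserve_none get dedicated
// assigners, Windows targets use the Win64 (or Arm64EC) variants, Darwin
// splits varargs out into the DarwinPCS vararg assigners, and the remaining
// conventions fall back to the standard AAPCS assigner.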
7842
7843CCAssignFn *
7844AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7845 switch (CC) {
7846 default:
7847 return RetCC_AArch64_AAPCS;
7848 case CallingConv::ARM64EC_Thunk_X64:
7849 return RetCC_AArch64_Arm64EC_Thunk;
7850 case CallingConv::CFGuard_Check:
7851 if (Subtarget->isWindowsArm64EC())
7852 return RetCC_AArch64_Arm64EC_CFGuard_Check;
7853 return RetCC_AArch64_AAPCS;
7854 }
7855}
7856
7857static bool isPassedInFPR(EVT VT) {
7858 return VT.isFixedLengthVector() ||
7859 (VT.isFloatingPoint() && !VT.isScalableVector());
7860}
7861
7862SDValue AArch64TargetLowering::LowerFormalArguments(
7863 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7864 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7865 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7866 MachineFunction &MF = DAG.getMachineFunction();
7867 const Function &F = MF.getFunction();
7868 MachineFrameInfo &MFI = MF.getFrameInfo();
7869 bool IsWin64 =
7870 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7871 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7872 (isVarArg && Subtarget->isWindowsArm64EC());
7873 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7874
7875 SmallVector<ISD::OutputArg, 4> Outs;
7876 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7877 DAG.getTargetLoweringInfo(), MF.getDataLayout());
7878 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7879 FuncInfo->setIsSVECC(true);
7880
7881 // Assign locations to all of the incoming arguments.
7882 SmallVector<CCValAssign, 16> ArgLocs;
7883 DenseMap<unsigned, SDValue> CopiedRegs;
7884 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7885
7886 // At this point, Ins[].VT may already be promoted to i32. To correctly
7887 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7888 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7889 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7890 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7891 // LocVT.
7892 unsigned NumArgs = Ins.size();
7893 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7894 unsigned CurArgIdx = 0;
7895 for (unsigned i = 0; i != NumArgs; ++i) {
7896 MVT ValVT = Ins[i].VT;
7897 if (Ins[i].isOrigArg()) {
7898 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7899 CurArgIdx = Ins[i].getOrigArgIndex();
7900
7901 // Get type of the original argument.
7902 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7903 /*AllowUnknown*/ true);
7904 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7905 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7906 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7907 ValVT = MVT::i8;
7908 else if (ActualMVT == MVT::i16)
7909 ValVT = MVT::i16;
7910 }
7911 bool UseVarArgCC = false;
7912 if (IsWin64)
7913 UseVarArgCC = isVarArg;
7914 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7915 bool Res =
7916 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7917 assert(!Res && "Call operand has unhandled type");
7918 (void)Res;
7919 }
7920
7922 bool IsLocallyStreaming =
7923 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7924 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7925 SDValue Glue = Chain.getValue(1);
7926
7927 SmallVector<SDValue, 16> ArgValues;
7928 unsigned ExtraArgLocs = 0;
7929 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7930 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7931
7932 if (Ins[i].Flags.isByVal()) {
7933 // Byval is used for HFAs in the PCS, but the system should work in a
7934 // non-compliant manner for larger structs.
7935 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7936 int Size = Ins[i].Flags.getByValSize();
7937 unsigned NumRegs = (Size + 7) / 8;
7938
7939 // FIXME: This works on big-endian for composite byvals, which are the common
7940 // case. It should also work for fundamental types too.
7941 unsigned FrameIdx =
7942 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7943 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7944 InVals.push_back(FrameIdxN);
7945
7946 continue;
7947 }
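// Byval example: a 12-byte struct passed byval gives Size = 12, so
// NumRegs = (12 + 7) / 8 = 2 and a 16-byte fixed stack object is created at
// the argument's stack offset; the callee then sees a frame-index pointer to
// that object rather than the struct value itself.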
7948
7949 if (Ins[i].Flags.isSwiftAsync())
7950 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7951
7952 SDValue ArgValue;
7953 if (VA.isRegLoc()) {
7954 // Arguments stored in registers.
7955 EVT RegVT = VA.getLocVT();
7956 const TargetRegisterClass *RC;
7957
7958 if (RegVT == MVT::i32)
7959 RC = &AArch64::GPR32RegClass;
7960 else if (RegVT == MVT::i64)
7961 RC = &AArch64::GPR64RegClass;
7962 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7963 RC = &AArch64::FPR16RegClass;
7964 else if (RegVT == MVT::f32)
7965 RC = &AArch64::FPR32RegClass;
7966 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7967 RC = &AArch64::FPR64RegClass;
7968 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7969 RC = &AArch64::FPR128RegClass;
7970 else if (RegVT.isScalableVector() &&
7971 RegVT.getVectorElementType() == MVT::i1) {
7972 FuncInfo->setIsSVECC(true);
7973 RC = &AArch64::PPRRegClass;
7974 } else if (RegVT == MVT::aarch64svcount) {
7975 FuncInfo->setIsSVECC(true);
7976 RC = &AArch64::PPRRegClass;
7977 } else if (RegVT.isScalableVector()) {
7978 FuncInfo->setIsSVECC(true);
7979 RC = &AArch64::ZPRRegClass;
7980 } else
7981 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7982
7983 // Transform the arguments in physical registers into virtual ones.
7984 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7985
7986 if (IsLocallyStreaming) {
7987 // LocallyStreamingFunctions must insert the SMSTART in the correct
7988 // position, so we use Glue to ensure no instructions can be scheduled
7989 // between the chain of:
7990 // t0: ch,glue = EntryNode
7991 // t1: res,ch,glue = CopyFromReg
7992 // ...
7993 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7994 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7995 // ^^^^^^
7996 // This will be the new Chain/Root node.
7997 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7998 Glue = ArgValue.getValue(2);
7999 if (isPassedInFPR(ArgValue.getValueType())) {
8000 ArgValue =
8001 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8002 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8003 {ArgValue, Glue});
8004 Glue = ArgValue.getValue(1);
8005 }
8006 } else
8007 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8008
8009 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8010 // to 64 bits. Insert an assert[sz]ext to capture this, then
8011 // truncate to the right size.
8012 switch (VA.getLocInfo()) {
8013 default:
8014 llvm_unreachable("Unknown loc info!");
8015 case CCValAssign::Full:
8016 break;
8017 case CCValAssign::Indirect:
8018 assert(
8019 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8020 "Indirect arguments should be scalable on most subtargets");
8021 break;
8022 case CCValAssign::BCvt:
8023 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8024 break;
8025 case CCValAssign::AExt:
8026 case CCValAssign::SExt:
8027 case CCValAssign::ZExt:
8028 break;
8029 case CCValAssign::AExtUpper:
8030 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8031 DAG.getConstant(32, DL, RegVT));
8032 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8033 break;
8034 }
8035 } else { // VA.isRegLoc()
8036 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8037 unsigned ArgOffset = VA.getLocMemOffset();
8038 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8039 ? VA.getLocVT().getSizeInBits()
8040 : VA.getValVT().getSizeInBits()) / 8;
8041
8042 uint32_t BEAlign = 0;
8043 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8044 !Ins[i].Flags.isInConsecutiveRegs())
8045 BEAlign = 8 - ArgSize;
8046
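// BEAlign example: on a big-endian target a 4-byte argument still occupies an
// 8-byte stack slot, but the value sits in the high-addressed half, so
// BEAlign = 8 - 4 = 4 is added to the load offset. Arguments that are part of
// a consecutive-register block are excluded from this adjustment.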
8047 SDValue FIN;
8048 MachinePointerInfo PtrInfo;
8049 if (StackViaX4) {
8050 // In both the ARM64EC varargs convention and the thunk convention,
8051 // arguments on the stack are accessed relative to x4, not sp. In
8052 // the thunk convention, there's an additional offset of 32 bytes
8053 // to account for the shadow store.
8054 unsigned ObjOffset = ArgOffset + BEAlign;
8055 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8056 ObjOffset += 32;
8057 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8058 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8059 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8060 DAG.getConstant(ObjOffset, DL, MVT::i64));
8061 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8062 } else {
8063 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8064
8065 // Create load nodes to retrieve arguments from the stack.
8066 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8067 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8068 }
8069
8070 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8071 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8072 MVT MemVT = VA.getValVT();
8073
8074 switch (VA.getLocInfo()) {
8075 default:
8076 break;
8077 case CCValAssign::Trunc:
8078 case CCValAssign::BCvt:
8079 MemVT = VA.getLocVT();
8080 break;
8081 case CCValAssign::Indirect:
8082 assert((VA.getValVT().isScalableVT() ||
8083 Subtarget->isWindowsArm64EC()) &&
8084 "Indirect arguments should be scalable on most subtargets");
8085 MemVT = VA.getLocVT();
8086 break;
8087 case CCValAssign::SExt:
8088 ExtType = ISD::SEXTLOAD;
8089 break;
8090 case CCValAssign::ZExt:
8091 ExtType = ISD::ZEXTLOAD;
8092 break;
8093 case CCValAssign::AExt:
8094 ExtType = ISD::EXTLOAD;
8095 break;
8096 }
8097
8098 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8099 MemVT);
8100 }
8101
8102 if (VA.getLocInfo() == CCValAssign::Indirect) {
8103 assert((VA.getValVT().isScalableVT() ||
8104 Subtarget->isWindowsArm64EC()) &&
8105 "Indirect arguments should be scalable on most subtargets");
8106
8107 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8108 unsigned NumParts = 1;
8109 if (Ins[i].Flags.isInConsecutiveRegs()) {
8110 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8111 ++NumParts;
8112 }
8113
8114 MVT PartLoad = VA.getValVT();
8115 SDValue Ptr = ArgValue;
8116
8117 // Ensure we generate all loads for each tuple part, whilst updating the
8118 // pointer after each load correctly using vscale.
8119 while (NumParts > 0) {
8120 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8121 InVals.push_back(ArgValue);
8122 NumParts--;
8123 if (NumParts > 0) {
8124 SDValue BytesIncrement;
8125 if (PartLoad.isScalableVector()) {
8126 BytesIncrement = DAG.getVScale(
8127 DL, Ptr.getValueType(),
8128 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8129 } else {
8130 BytesIncrement = DAG.getConstant(
8131 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8132 Ptr.getValueType());
8133 }
8134 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8135 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8136 ExtraArgLocs++;
8137 i++;
8138 }
8139 }
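// Indirect SVE tuple example: a two-part tuple of scalable vectors passed
// indirectly produces two loads here; after the first load the pointer is
// advanced by PartSize * vscale bytes (via DAG.getVScale), so each part is
// read at the correct hardware-vector-length-dependent offset.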
8140 } else {
8141 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8142 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8143 ArgValue, DAG.getValueType(MVT::i32));
8144
8145 // i1 arguments are zero-extended to i8 by the caller. Emit a
8146 // hint to reflect this.
8147 if (Ins[i].isOrigArg()) {
8148 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8149 if (OrigArg->getType()->isIntegerTy(1)) {
8150 if (!Ins[i].Flags.isZExt()) {
8151 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8152 ArgValue.getValueType(), ArgValue);
8153 }
8154 }
8155 }
8156
8157 InVals.push_back(ArgValue);
8158 }
8159 }
8160 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8161
8162 // Insert the SMSTART if this is a locally streaming function and
8163 // make sure it is Glued to the last CopyFromReg value.
8164 if (IsLocallyStreaming) {
8165 SDValue PStateSM;
8166 if (Attrs.hasStreamingCompatibleInterface()) {
8167 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8170 FuncInfo->setPStateSMReg(Reg);
8171 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
8172 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8173 AArch64SME::IfCallerIsNonStreaming, PStateSM);
8174 } else
8175 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8176 AArch64SME::Always);
8177
8178 // Ensure that the SMSTART happens after the CopyWithChain such that its
8179 // chain result is used.
8180 for (unsigned I=0; I<InVals.size(); ++I) {
8181 Register Reg = MF.getRegInfo().createVirtualRegister(
8182 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8183 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8184 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8185 InVals[I].getValueType());
8186 }
8187 }
8188
8189 // varargs
8190 if (isVarArg) {
8191 if (!Subtarget->isTargetDarwin() || IsWin64) {
8192 // The AAPCS variadic function ABI is identical to the non-variadic
8193 // one. As a result there may be more arguments in registers and we should
8194 // save them for future reference.
8195 // Win64 variadic functions also pass arguments in registers, but all float
8196 // arguments are passed in integer registers.
8197 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8198 }
8199
8200 // This will point to the next argument passed via stack.
8201 unsigned VarArgsOffset = CCInfo.getStackSize();
8202 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8203 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8204 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8205 FuncInfo->setVarArgsStackIndex(
8206 MFI.CreateFixedObject(4, VarArgsOffset, true));
8207
8208 if (MFI.hasMustTailInVarArgFunc()) {
8209 SmallVector<MVT, 2> RegParmTypes;
8210 RegParmTypes.push_back(MVT::i64);
8211 RegParmTypes.push_back(MVT::f128);
8212 // Compute the set of forwarded registers. The rest are scratch.
8213 SmallVectorImpl<ForwardedRegister> &Forwards =
8214 FuncInfo->getForwardedMustTailRegParms();
8215 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8216 CC_AArch64_AAPCS);
8217
8218 // Conservatively forward X8, since it might be used for aggregate return.
8219 if (!CCInfo.isAllocated(AArch64::X8)) {
8220 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8221 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8222 }
8223 }
8224 }
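// Varargs bookkeeping example: the first anonymous argument starts at
// CCInfo.getStackSize(), rounded up to 8 bytes (4 for ILP32); that offset is
// recorded in the VarArgsStackIndex frame object for later use when va_start
// is lowered.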
8225
8226 // On Windows, InReg pointers must be returned, so record the pointer in a
8227 // virtual register at the start of the function so it can be returned in the
8228 // epilogue.
8229 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8230 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8231 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8232 Ins[I].Flags.isInReg()) &&
8233 Ins[I].Flags.isSRet()) {
8234 assert(!FuncInfo->getSRetReturnReg());
8235
8236 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8237 Register Reg =
8238 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8239 FuncInfo->setSRetReturnReg(Reg);
8240
8241 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8242 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8243 break;
8244 }
8245 }
8246 }
8247
8248 unsigned StackArgSize = CCInfo.getStackSize();
8249 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8250 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8251 // This is a non-standard ABI so by fiat I say we're allowed to make full
8252 // use of the stack area to be popped, which must be aligned to 16 bytes in
8253 // any case:
8254 StackArgSize = alignTo(StackArgSize, 16);
8255
8256 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8257 // a multiple of 16.
8258 FuncInfo->setArgumentStackToRestore(StackArgSize);
8259
8260 // This realignment carries over to the available bytes below. Our own
8261 // callers will guarantee the space is free by giving an aligned value to
8262 // CALLSEQ_START.
8263 }
8264 // Even if we're not expected to free up the space, it's useful to know how
8265 // much is there while considering tail calls (because we can reuse it).
8266 FuncInfo->setBytesInStackArgArea(StackArgSize);
8267
8268 if (Subtarget->hasCustomCallingConv())
8269 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8270
8271 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8272 // will be expanded and stored in the static object later using a pseudonode.
8273 if (SMEAttrs(MF.getFunction()).hasZAState()) {
8274 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8275 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8276 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8277 DAG.getConstant(1, DL, MVT::i32));
8278
8279 SDValue Buffer;
8280 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8281 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8282 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8283 } else {
8284 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8285 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8286 DAG.getVTList(MVT::i64, MVT::Other),
8287 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8288 MFI.CreateVariableSizedObject(Align(16), nullptr);
8289 }
8290 Chain = DAG.getNode(
8291 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8292 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
8293 } else if (SMEAttrs(MF.getFunction()).hasAgnosticZAInterface()) {
8294 // Call __arm_sme_state_size().
8295 SDValue BufferSize =
8297 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8298 Chain = BufferSize.getValue(1);
8299
8300 SDValue Buffer;
8301 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8302 Buffer =
8304 DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize});
8305 } else {
8306 // Allocate space dynamically.
8307 Buffer = DAG.getNode(
8308 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8309 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8310 MFI.CreateVariableSizedObject(Align(16), nullptr);
8311 }
8312
8313 // Copy the value to a virtual register, and save that in FuncInfo.
8314 Register BufferPtr =
8315 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8316 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8317 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8318 }
8319
8320 if (CallConv == CallingConv::PreserveNone) {
8321 for (const ISD::InputArg &I : Ins) {
8322 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8323 I.Flags.isSwiftAsync()) {
8326 MF.getFunction(),
8327 "Swift attributes can't be used with preserve_none",
8328 DL.getDebugLoc()));
8329 break;
8330 }
8331 }
8332 }
8333
8334 return Chain;
8335}
8336
8337void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8338 SelectionDAG &DAG,
8339 const SDLoc &DL,
8340 SDValue &Chain) const {
8341 MachineFunction &MF = DAG.getMachineFunction();
8342 MachineFrameInfo &MFI = MF.getFrameInfo();
8343 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8344 auto PtrVT = getPointerTy(DAG.getDataLayout());
8345 Function &F = MF.getFunction();
8346 bool IsWin64 =
8347 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8348
8349 SmallVector<SDValue, 8> MemOps;
8350
8351 auto GPRArgRegs = AArch64::getGPRArgRegs();
8352 unsigned NumGPRArgRegs = GPRArgRegs.size();
8353 if (Subtarget->isWindowsArm64EC()) {
8354 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8355 // functions.
8356 NumGPRArgRegs = 4;
8357 }
8358 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8359
8360 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8361 int GPRIdx = 0;
8362 if (GPRSaveSize != 0) {
8363 if (IsWin64) {
8364 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8365 if (GPRSaveSize & 15)
8366 // The extra size here, if triggered, will always be 8.
8367 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8368 } else
8369 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8370
8371 SDValue FIN;
8372 if (Subtarget->isWindowsArm64EC()) {
8373 // With the Arm64EC ABI, we reserve the save area as usual, but we
8374 // compute its address relative to x4. For a normal AArch64->AArch64
8375 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8376 // different address.
8377 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8378 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8379 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8380 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8381 } else {
8382 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8383 }
8384
8385 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8386 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8387 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8388 SDValue Store =
8389 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8390 IsWin64 ? MachinePointerInfo::getFixedStack(
8391 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8392 : MachinePointerInfo::getStack(MF, i * 8));
8393 MemOps.push_back(Store);
8394 FIN =
8395 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8396 }
8397 }
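// GPR save-area example: for a Win64 variadic function whose prototype names
// three integer arguments, FirstVariadicGPR = 3 and
// GPRSaveSize = 8 * (8 - 3) = 40 bytes; the extra 8-byte fixed object above
// pads the area out to a 16-byte boundary.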
8398 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8399 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8400
8401 if (Subtarget->hasFPARMv8() && !IsWin64) {
8402 auto FPRArgRegs = AArch64::getFPRArgRegs();
8403 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8404 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8405
8406 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8407 int FPRIdx = 0;
8408 if (FPRSaveSize != 0) {
8409 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8410
8411 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8412
8413 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8414 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8415 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8416
8417 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8418 MachinePointerInfo::getStack(MF, i * 16));
8419 MemOps.push_back(Store);
8420 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8421 DAG.getConstant(16, DL, PtrVT));
8422 }
8423 }
8424 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8425 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8426 }
8427
8428 if (!MemOps.empty()) {
8429 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8430 }
8431}
8432
8433/// LowerCallResult - Lower the result values of a call into the
8434/// appropriate copies out of appropriate physical registers.
8435SDValue AArch64TargetLowering::LowerCallResult(
8436 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8437 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8438 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8439 SDValue ThisVal, bool RequiresSMChange) const {
8440 DenseMap<unsigned, SDValue> CopiedRegs;
8441 // Copy all of the result registers out of their specified physreg.
8442 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8443 CCValAssign VA = RVLocs[i];
8444
8445 // Pass 'this' value directly from the argument to return value, to avoid
8446 // reg unit interference
8447 if (i == 0 && isThisReturn) {
8448 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8449 "unexpected return calling convention register assignment");
8450 InVals.push_back(ThisVal);
8451 continue;
8452 }
8453
8454 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8455 // allows one use of a physreg per block.
8456 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8457 if (!Val) {
8458 Val =
8459 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8460 Chain = Val.getValue(1);
8461 InGlue = Val.getValue(2);
8462 CopiedRegs[VA.getLocReg()] = Val;
8463 }
8464
8465 switch (VA.getLocInfo()) {
8466 default:
8467 llvm_unreachable("Unknown loc info!");
8468 case CCValAssign::Full:
8469 break;
8470 case CCValAssign::BCvt:
8471 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8472 break;
8473 case CCValAssign::AExtUpper:
8474 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8475 DAG.getConstant(32, DL, VA.getLocVT()));
8476 [[fallthrough]];
8477 case CCValAssign::AExt:
8478 [[fallthrough]];
8479 case CCValAssign::ZExt:
8480 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8481 break;
8482 }
8483
8484 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8485 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
8486 Val);
8487
8488 InVals.push_back(Val);
8489 }
8490
8491 return Chain;
8492}
8493
8494/// Return true if the calling convention is one that we can guarantee TCO for.
8495static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8496 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8497 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8498}
8499
8500/// Return true if we might ever do TCO for calls with this calling convention.
8501static bool mayTailCallThisCC(CallingConv::ID CC) {
8502 switch (CC) {
8503 case CallingConv::C:
8504 case CallingConv::AArch64_SVE_VectorCall:
8505 case CallingConv::PreserveMost:
8506 case CallingConv::PreserveAll:
8507 case CallingConv::PreserveNone:
8508 case CallingConv::Swift:
8509 case CallingConv::SwiftTail:
8510 case CallingConv::Tail:
8511 case CallingConv::Fast:
8512 return true;
8513 default:
8514 return false;
8515 }
8516}
8517
8518/// Return true if the call convention supports varargs
8519/// Currently only those that pass varargs like the C
8520/// calling convention does are eligible
8521/// Calling conventions listed in this function must also
8522/// be properly handled in AArch64Subtarget::isCallingConvWin64
8523static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8524 switch (CC) {
8525 case CallingConv::C:
8526 case CallingConv::PreserveNone:
8527 return true;
8528 default:
8529 return false;
8530 }
8531}
8532
8533static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8534 const AArch64Subtarget *Subtarget,
8535 const TargetLowering::CallLoweringInfo &CLI,
8536 CCState &CCInfo) {
8537 const SelectionDAG &DAG = CLI.DAG;
8538 CallingConv::ID CalleeCC = CLI.CallConv;
8539 bool IsVarArg = CLI.IsVarArg;
8540 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8541 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8542
8543 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8544 // for the shadow store.
8545 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8546 CCInfo.AllocateStack(32, Align(16));
8547
8548 unsigned NumArgs = Outs.size();
8549 for (unsigned i = 0; i != NumArgs; ++i) {
8550 MVT ArgVT = Outs[i].VT;
8551 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8552
8553 bool UseVarArgCC = false;
8554 if (IsVarArg) {
8555 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8556 // too, so use the vararg CC to force them to integer registers.
8557 if (IsCalleeWin64) {
8558 UseVarArgCC = true;
8559 } else {
8560 UseVarArgCC = !Outs[i].IsFixed;
8561 }
8562 }
8563
8564 if (!UseVarArgCC) {
8565 // Get type of the original argument.
8566 EVT ActualVT =
8567 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8568 /*AllowUnknown*/ true);
8569 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8570 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8571 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8572 ArgVT = MVT::i8;
8573 else if (ActualMVT == MVT::i16)
8574 ArgVT = MVT::i16;
8575 }
8576
8577 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8578 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
8579 assert(!Res && "Call operand has unhandled type");
8580 (void)Res;
8581 }
8582}
8583
8584bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8585 const CallLoweringInfo &CLI) const {
8586 CallingConv::ID CalleeCC = CLI.CallConv;
8587 if (!mayTailCallThisCC(CalleeCC))
8588 return false;
8589
8590 SDValue Callee = CLI.Callee;
8591 bool IsVarArg = CLI.IsVarArg;
8592 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8593 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8594 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8595 const SelectionDAG &DAG = CLI.DAG;
8596 MachineFunction &MF = DAG.getMachineFunction();
8597 const Function &CallerF = MF.getFunction();
8598 CallingConv::ID CallerCC = CallerF.getCallingConv();
8599
8600 // SME Streaming functions are not eligible for TCO as they may require
8601 // the streaming mode or ZA to be restored after returning from the call.
8602 SMEAttrs CallerAttrs(MF.getFunction());
8603 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
8604 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
8605 CallerAttrs.requiresLazySave(CalleeAttrs) ||
8606 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs) ||
8607 CallerAttrs.hasStreamingBody())
8608 return false;
8609
8610 // Functions using the C or Fast calling convention that have an SVE signature
8611 // preserve more registers and should assume the SVE_VectorCall CC.
8612 // The check for matching callee-saved regs will determine whether it is
8613 // eligible for TCO.
8614 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8615 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8616 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8617
8618 bool CCMatch = CallerCC == CalleeCC;
8619
8620 // When using the Windows calling convention on a non-windows OS, we want
8621 // to back up and restore X18 in such functions; we can't do a tail call
8622 // from those functions.
8623 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8624 CalleeCC != CallingConv::Win64)
8625 return false;
8626
8627 // Byval parameters hand the function a pointer directly into the stack area
8628 // we want to reuse during a tail call. Working around this *is* possible (see
8629 // X86) but less efficient and uglier in LowerCall.
8630 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8631 e = CallerF.arg_end();
8632 i != e; ++i) {
8633 if (i->hasByValAttr())
8634 return false;
8635
8636 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8637 // In this case, it is necessary to save/restore X0 in the callee. Tail
8638 // call opt interferes with this. So we disable tail call opt when the
8639 // caller has an argument with "inreg" attribute.
8640
8641 // FIXME: Check whether the callee also has an "inreg" argument.
8642 if (i->hasInRegAttr())
8643 return false;
8644 }
8645
8646 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8647 return CCMatch;
8648
8649 // Externally-defined functions with weak linkage should not be
8650 // tail-called on AArch64 when the OS does not support dynamic
8651 // pre-emption of symbols, as the AAELF spec requires normal calls
8652 // to undefined weak functions to be replaced with a NOP or jump to the
8653 // next instruction. The behaviour of branch instructions in this
8654 // situation (as used for tail calls) is implementation-defined, so we
8655 // cannot rely on the linker replacing the tail call with a return.
8656 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8657 const GlobalValue *GV = G->getGlobal();
8658 const Triple &TT = getTargetMachine().getTargetTriple();
8659 if (GV->hasExternalWeakLinkage() &&
8660 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8661 return false;
8662 }
8663
8664 // Now we search for cases where we can use a tail call without changing the
8665 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8666 // concept.
8667
8668 // I want anyone implementing a new calling convention to think long and hard
8669 // about this assert.
8670 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8671 report_fatal_error("Unsupported variadic calling convention");
8672
8673 LLVMContext &C = *DAG.getContext();
8674 // Check that the call results are passed in the same way.
8675 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8676 CCAssignFnForCall(CalleeCC, IsVarArg),
8677 CCAssignFnForCall(CallerCC, IsVarArg)))
8678 return false;
8679 // The callee has to preserve all registers the caller needs to preserve.
8680 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8681 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8682 if (!CCMatch) {
8683 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8684 if (Subtarget->hasCustomCallingConv()) {
8685 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8686 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8687 }
8688 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8689 return false;
8690 }
8691
8692 // Nothing more to check if the callee is taking no arguments
8693 if (Outs.empty())
8694 return true;
8695
8697 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8698
8699 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8700
8701 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8702 // When we are musttail, additional checks have been done and we can safely ignore this check
8703 // At least two cases here: if caller is fastcc then we can't have any
8704 // memory arguments (we'd be expected to clean up the stack afterwards). If
8705 // caller is C then we could potentially use its argument area.
8706
8707 // FIXME: for now we take the most conservative of these in both cases:
8708 // disallow all variadic memory operands.
8709 for (const CCValAssign &ArgLoc : ArgLocs)
8710 if (!ArgLoc.isRegLoc())
8711 return false;
8712 }
8713
8714 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8715
8716 // If any of the arguments is passed indirectly, it must be SVE, so the
8717 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8718 // allocate space on the stack. That is why we determine this explicitly here:
8719 // if that is the case, the call cannot be a tailcall.
8720 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8721 assert((A.getLocInfo() != CCValAssign::Indirect ||
8722 A.getValVT().isScalableVector() ||
8723 Subtarget->isWindowsArm64EC()) &&
8724 "Expected value to be scalable");
8725 return A.getLocInfo() == CCValAssign::Indirect;
8726 }))
8727 return false;
8728
8729 // If the stack arguments for this call do not fit into our own save area then
8730 // the call cannot be made tail.
8731 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8732 return false;
8733
8734 const MachineRegisterInfo &MRI = MF.getRegInfo();
8735 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8736 return false;
8737
8738 return true;
8739}
8740
8741SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8742 SelectionDAG &DAG,
8743 MachineFrameInfo &MFI,
8744 int ClobberedFI) const {
8745 SmallVector<SDValue, 8> ArgChains;
8746 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8747 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8748
8749 // Include the original chain at the beginning of the list. When this is
8750 // used by target LowerCall hooks, this helps legalize find the
8751 // CALLSEQ_BEGIN node.
8752 ArgChains.push_back(Chain);
8753
8754 // Add a chain value for each stack argument corresponding
8755 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8756 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8757 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8758 if (FI->getIndex() < 0) {
8759 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8760 int64_t InLastByte = InFirstByte;
8761 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8762
8763 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8764 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8765 ArgChains.push_back(SDValue(L, 1));
8766 }
8767
8768 // Build a tokenfactor for all the chains.
8769 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8770}
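// The token factor built here ties the chain to every load of an incoming
// stack argument whose slot overlaps the one about to be clobbered, so a tail
// call cannot overwrite a caller stack slot before its original value has
// been read.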
8771
8772bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8773 bool TailCallOpt) const {
8774 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8775 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8776}
8777
8778// Check if the value is zero-extended from i1 to i8
8779static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8780 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8781 if (SizeInBits < 8)
8782 return false;
8783
8784 APInt RequiredZero(SizeInBits, 0xFE);
8785 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8786 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8787 return ZExtBool;
8788}
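// checkZExtBool example: RequiredZero = 0xFE, so the argument passes only if
// every bit except bit 0 is known to be zero, e.g. a value produced by
// zero-extending an i1. A plain i8 load fails the check, and the caller below
// re-materializes the zero-extension explicitly before passing the value.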
8789
8790// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
8791// input operands are copy nodes where the source register is in a
8792// StridedOrContiguous class. For example:
8793//
8794// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
8795// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
8796// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
8797// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
8798// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
8799// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
8800// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
8801//
8802static bool shouldUseFormStridedPseudo(MachineInstr &MI) {
8803 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
8804
8805 assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
8806 MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) &&
8807 "Unexpected opcode.");
8808
8809 unsigned SubReg = AArch64::NoSubRegister;
8810 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8811 MachineOperand &MO = MI.getOperand(I);
8812 assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
8813
8814 MachineOperand *Def = MRI.getOneDef(MO.getReg());
8815 if (!Def || !Def->getParent()->isCopy())
8816 return false;
8817
8818 const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
8819 unsigned OpSubReg = CopySrc.getSubReg();
8820 if (SubReg == AArch64::NoSubRegister)
8821 SubReg = OpSubReg;
8822
8823 MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
8824 const TargetRegisterClass *CopySrcClass =
8825 CopySrcOp ? MRI.getRegClass(CopySrcOp->getReg()) : nullptr;
8826 if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
8827 (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
8828 CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass))
8829 return false;
8830 }
8831
8832 return true;
8833}
8834
8835void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8836 SDNode *Node) const {
8837 // Live-in physreg copies that are glued to SMSTART are applied as
8838 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8839 // register allocator to pass call args in callee saved regs, without extra
8840 // copies to avoid these fake clobbers of actually-preserved GPRs.
8841 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8842 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8843 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8844 if (MachineOperand &MO = MI.getOperand(I);
8845 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8846 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8847 AArch64::GPR64RegClass.contains(MO.getReg())))
8848 MI.removeOperand(I);
8849
8850 // The SVE vector length can change when entering/leaving streaming mode.
8851 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8852 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8853 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8854 /*IsImplicit=*/true));
8855 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8856 /*IsImplicit=*/true));
8857 }
8858 }
8859
8860 if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
8861 MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
8862 // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
8863 // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
8864 if (shouldUseFormStridedPseudo(MI))
8865 return;
8866
8867 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8868 MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
8869 TII->get(TargetOpcode::REG_SEQUENCE),
8870 MI.getOperand(0).getReg());
8871
8872 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8873 MIB.add(MI.getOperand(I));
8874 MIB.addImm(AArch64::zsub0 + (I - 1));
8875 }
8876
8877 MI.eraseFromParent();
8878 return;
8879 }
8880
8881 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8882 // have nothing to do with VG, were it not that they are used to materialise a
8883 // frame-address. If they contain a frame-index to a scalable vector, this
8884 // will likely require an ADDVL instruction to materialise the address, thus
8885 // reading VG.
8886 const MachineFunction &MF = *MI.getMF();
8887 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8888 (MI.getOpcode() == AArch64::ADDXri ||
8889 MI.getOpcode() == AArch64::SUBXri)) {
8890 const MachineOperand &MO = MI.getOperand(1);
8891 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8892 TargetStackID::ScalableVector)
8893 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8894 /*IsImplicit=*/true));
8895 }
8896}
8897
8898SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8899 bool Enable, SDValue Chain,
8900 SDValue InGlue,
8901 unsigned Condition,
8902 SDValue PStateSM) const {
8903 MachineFunction &MF = DAG.getMachineFunction();
8904 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8905 FuncInfo->setHasStreamingModeChanges(true);
8906
8907 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8908 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8909 SDValue MSROp =
8910 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8911 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8912 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8913 if (Condition != AArch64SME::Always) {
8914 assert(PStateSM && "PStateSM should be defined");
8915 Ops.push_back(PStateSM);
8916 }
8917 Ops.push_back(RegMask);
8918
8919 if (InGlue)
8920 Ops.push_back(InGlue);
8921
8922 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8923 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8924}
8925
8926// Emit a call to __arm_sme_save or __arm_sme_restore.
8927static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
8928 SelectionDAG &DAG,
8929 AArch64FunctionInfo *Info, SDLoc DL,
8930 SDValue Chain, bool IsSave) {
8931 MachineFunction &MF = DAG.getMachineFunction();
8932 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8933 FuncInfo->setSMESaveBufferUsed();
8934
8935 TargetLowering::ArgListTy Args;
8936 TargetLowering::ArgListEntry Entry;
8937 Entry.Ty = PointerType::getUnqual(*DAG.getContext());
8938 Entry.Node =
8939 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
8940 Args.push_back(Entry);
8941
8942 SDValue Callee =
8943 DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
8944 TLI.getPointerTy(DAG.getDataLayout()));
8945 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8946 TargetLowering::CallLoweringInfo CLI(DAG);
8947 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8948 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy,
8949 Callee, std::move(Args));
8950 return TLI.LowerCallTo(CLI).second;
8951}
8952
8953static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8954 const SMEAttrs &CalleeAttrs) {
8955 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8956 CallerAttrs.hasStreamingBody())
8957 return AArch64SME::Always;
8958 if (CalleeAttrs.hasNonStreamingInterface())
8959 return AArch64SME::IfCallerIsStreaming;
8960 if (CalleeAttrs.hasStreamingInterface())
8961 return AArch64SME::IfCallerIsNonStreaming;
8962
8963 llvm_unreachable("Unsupported attributes");
8964}
8965
8966/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8967/// and add input and output parameter nodes.
8968SDValue
8969AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8970 SmallVectorImpl<SDValue> &InVals) const {
8971 SelectionDAG &DAG = CLI.DAG;
8972 SDLoc &DL = CLI.DL;
8973 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8974 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8975 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8976 SDValue Chain = CLI.Chain;
8977 SDValue Callee = CLI.Callee;
8978 bool &IsTailCall = CLI.IsTailCall;
8979 CallingConv::ID &CallConv = CLI.CallConv;
8980 bool IsVarArg = CLI.IsVarArg;
8981
8984 bool IsThisReturn = false;
8985
8987 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8988 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8989 bool IsSibCall = false;
8990 bool GuardWithBTI = false;
8991
8992 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8993 !Subtarget->noBTIAtReturnTwice()) {
8994 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8995 }
8996
8997 // Analyze operands of the call, assigning locations to each operand.
8998 SmallVector<CCValAssign, 16> ArgLocs;
8999 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9000
9001 if (IsVarArg) {
9002 unsigned NumArgs = Outs.size();
9003
9004 for (unsigned i = 0; i != NumArgs; ++i) {
9005 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
9006 report_fatal_error("Passing SVE types to variadic functions is "
9007 "currently not supported");
9008 }
9009 }
9010
9011 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9012
9013 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9014 // Assign locations to each value returned by this call.
9015 SmallVector<CCValAssign, 16> RVLocs;
9016 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9017 *DAG.getContext());
9018 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9019
9020 // Check callee args/returns for SVE registers and set calling convention
9021 // accordingly.
9022 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9023 auto HasSVERegLoc = [](CCValAssign &Loc) {
9024 if (!Loc.isRegLoc())
9025 return false;
9026 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9027 AArch64::PPRRegClass.contains(Loc.getLocReg());
9028 };
9029 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9030 CallConv = CallingConv::AArch64_SVE_VectorCall;
9031 }
9032
9033 if (IsTailCall) {
9034 // Check if it's really possible to do a tail call.
9035 IsTailCall = isEligibleForTailCallOptimization(CLI);
9036
9037 // A sibling call is one where we're under the usual C ABI and not planning
9038 // to change that but can still do a tail call:
9039 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
9040 CallConv != CallingConv::SwiftTail)
9041 IsSibCall = true;
9042
9043 if (IsTailCall)
9044 ++NumTailCalls;
9045 }
9046
9047 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9048 report_fatal_error("failed to perform tail call elimination on a call "
9049 "site marked musttail");
9050
9051 // Get a count of how many bytes are to be pushed on the stack.
9052 unsigned NumBytes = CCInfo.getStackSize();
9053
9054 if (IsSibCall) {
9055 // Since we're not changing the ABI to make this a tail call, the memory
9056 // operands are already available in the caller's incoming argument space.
9057 NumBytes = 0;
9058 }
9059
9060 // FPDiff is the byte offset of the call's argument area from the callee's.
9061 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9062 // by this amount for a tail call. In a sibling call it must be 0 because the
9063 // caller will deallocate the entire stack and the callee still expects its
9064 // arguments to begin at SP+0. Completely unused for non-tail calls.
9065 int FPDiff = 0;
9066
9067 if (IsTailCall && !IsSibCall) {
9068 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9069
9070 // Since callee will pop argument stack as a tail call, we must keep the
9071 // popped size 16-byte aligned.
9072 NumBytes = alignTo(NumBytes, 16);
9073
9074 // FPDiff will be negative if this tail call requires more space than we
9075 // would automatically have in our incoming argument space. Positive if we
9076 // can actually shrink the stack.
9077 FPDiff = NumReusableBytes - NumBytes;
9078
9079 // Update the required reserved area if this is the tail call requiring the
9080 // most argument stack space.
9081 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9082 FuncInfo->setTailCallReservedStack(-FPDiff);
9083
9084 // The stack pointer must be 16-byte aligned at all times it's used for a
9085 // memory operation, which in practice means at *all* times and in
9086 // particular across call boundaries. Therefore our own arguments started at
9087 // a 16-byte aligned SP and the delta applied for the tail call should
9088 // satisfy the same constraint.
9089 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9090 }
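// FPDiff example: if the caller's own incoming argument area is 32 bytes
// (NumReusableBytes) and the tail callee needs 48 bytes of stack arguments,
// FPDiff = 32 - 48 = -16; the 16-byte shortfall is recorded via
// setTailCallReservedStack so the prologue reserves it up front.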
9091
9092 // Determine whether we need any streaming mode changes.
9093 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
9094 if (CLI.CB)
9095 CalleeAttrs = SMEAttrs(*CLI.CB);
9096 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9097 CalleeAttrs = SMEAttrs(ES->getSymbol());
9098
9099 auto DescribeCallsite =
9101 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9102 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9103 R << ore::NV("Callee", ES->getSymbol());
9104 else if (CLI.CB && CLI.CB->getCalledFunction())
9105 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9106 else
9107 R << "unknown callee";
9108 R << "'";
9109 return R;
9110 };
9111
9112 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
9113 bool RequiresSaveAllZA =
9114 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs);
9115 if (RequiresLazySave) {
9116 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9117 MachinePointerInfo MPI =
9119 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9120 TPIDR2.FrameIndex,
9122 SDValue NumZaSaveSlicesAddr =
9123 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
9124 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
9125 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9126 DAG.getConstant(1, DL, MVT::i32));
9127 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
9128 MPI, MVT::i16);
9129 Chain = DAG.getNode(
9130 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9131 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9132 TPIDR2ObjAddr);
9134 ORE.emit([&]() {
9135 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9136 CLI.CB)
9137 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9138 &MF.getFunction());
9139 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9140 });
9141 } else if (RequiresSaveAllZA) {
9142 assert(!CalleeAttrs.hasSharedZAInterface() &&
9143 "Cannot share state that may not exist");
9144 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9145 /*IsSave=*/true);
9146 }
9147
9148 SDValue PStateSM;
9149 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
9150 if (RequiresSMChange) {
9151 if (CallerAttrs.hasStreamingInterfaceOrBody())
9152 PStateSM = DAG.getConstant(1, DL, MVT::i64);
9153 else if (CallerAttrs.hasNonStreamingInterface())
9154 PStateSM = DAG.getConstant(0, DL, MVT::i64);
9155 else
9156 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
9157 OptimizationRemarkEmitter ORE(&MF.getFunction());
9158 ORE.emit([&]() {
9159 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9160 CLI.CB)
9161 : OptimizationRemarkAnalysis("sme", "SMETransition",
9162 &MF.getFunction());
9163 DescribeCallsite(R) << " requires a streaming mode transition";
9164 return R;
9165 });
9166 }
9167
9168 SDValue ZTFrameIdx;
9169 MachineFrameInfo &MFI = MF.getFrameInfo();
9170 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
9171
9172 // If the caller has ZT0 state which will not be preserved by the callee,
9173 // spill ZT0 before the call.
9174 if (ShouldPreserveZT0) {
9175 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
9176 ZTFrameIdx = DAG.getFrameIndex(
9177 ZTObj,
9178 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9179
9180 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9181 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
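// The SAVE_ZT node is normally selected to a single store of ZT0 into the
// 64-byte spill slot created above, roughly (illustrative):
//   str zt0, [x8]           ; x8 = address of the spill slot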
9182 }
9183
9184 // If the caller shares ZT0 but the callee does not share ZA state, we need
9185 // to stop PSTATE.ZA before the call if there is no lazy-save active.
9186 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
9187 assert((!DisableZA || !RequiresLazySave) &&
9188 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9189
9190 if (DisableZA)
9191 Chain = DAG.getNode(
9192 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
9193 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9194 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9195
9196 // Adjust the stack pointer for the new arguments...
9197 // These operations are automatically eliminated by the prolog/epilog pass
9198 if (!IsSibCall)
9199 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9200
9201 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9202 getPointerTy(DAG.getDataLayout()));
9203
9204 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9205 SmallSet<unsigned, 8> RegsUsed;
9206 SmallVector<SDValue, 8> MemOpChains;
9207 auto PtrVT = getPointerTy(DAG.getDataLayout());
9208
9209 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9210 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9211 for (const auto &F : Forwards) {
9212 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9213 RegsToPass.emplace_back(F.PReg, Val);
9214 }
9215 }
9216
9217 // Walk the register/memloc assignments, inserting copies/loads.
9218 unsigned ExtraArgLocs = 0;
9219 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9220 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9221 SDValue Arg = OutVals[i];
9222 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9223
9224 // Promote the value if needed.
9225 switch (VA.getLocInfo()) {
9226 default:
9227 llvm_unreachable("Unknown loc info!");
9228 case CCValAssign::Full:
9229 break;
9230 case CCValAssign::SExt:
9231 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9232 break;
9233 case CCValAssign::ZExt:
9234 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9235 break;
9236 case CCValAssign::AExt:
9237 if (Outs[i].ArgVT == MVT::i1) {
9238 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9239 //
9240 // Check if we actually have to do this, because the value may
9241 // already be zero-extended.
9242 //
9243 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9244 // and rely on DAGCombiner to fold this, because the following
9245 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9246 //
9247 // (ext (zext x)) -> (zext x)
9248 //
9249 // This will give us (zext i32), which we cannot remove, so
9250 // try to check this beforehand.
9251 if (!checkZExtBool(Arg, DAG)) {
9252 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9253 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9254 }
9255 }
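// For example (illustrative): an i1 argument produced directly by a compare
// is already zero-extended, so checkZExtBool lets us skip the extra
// trunc+zext pair here.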
9256 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9257 break;
9258 case CCValAssign::AExtUpper:
9259 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9260 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9261 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9262 DAG.getConstant(32, DL, VA.getLocVT()));
9263 break;
9264 case CCValAssign::BCvt:
9265 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9266 break;
9267 case CCValAssign::Trunc:
9268 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9269 break;
9270 case CCValAssign::FPExt:
9271 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9272 break;
9273 case CCValAssign::Indirect: {
9274 bool isScalable = VA.getValVT().isScalableVT();
9275 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9276 "Indirect arguments should be scalable on most subtargets");
9277
9278 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9279 uint64_t PartSize = StoreSize;
9280 unsigned NumParts = 1;
9281 if (Outs[i].Flags.isInConsecutiveRegs()) {
9282 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9283 ++NumParts;
9284 StoreSize *= NumParts;
9285 }
9286
9287 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9288 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9289 MachineFrameInfo &MFI = MF.getFrameInfo();
9290 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9291 if (isScalable)
9292 MFI.setStackID(FI, TargetStackID::ScalableVector);
9293
9294 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9295 SDValue Ptr = DAG.getFrameIndex(
9296 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9297 SDValue SpillSlot = Ptr;
9298
9299 // Ensure we generate all stores for each tuple part, whilst updating the
9300 // pointer after each store correctly using vscale.
9301 while (NumParts) {
9302 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9303 MemOpChains.push_back(Store);
9304
9305 NumParts--;
9306 if (NumParts > 0) {
9307 SDValue BytesIncrement;
9308 if (isScalable) {
9309 BytesIncrement = DAG.getVScale(
9310 DL, Ptr.getValueType(),
9311 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9312 } else {
9313 BytesIncrement = DAG.getConstant(
9314 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9315 Ptr.getValueType());
9316 }
9317 MPI = MachinePointerInfo(MPI.getAddrSpace());
9318 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9319 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9320 ExtraArgLocs++;
9321 i++;
9322 }
9323 }
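// For example (illustrative): a scalable <vscale x 4 x i32> argument passed
// indirectly is spilled to the slot created above, and only the slot address
// ends up in the register or stack location assigned by the calling
// convention.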
9324
9325 Arg = SpillSlot;
9326 break;
9327 }
9328
9329 if (VA.isRegLoc()) {
9330 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9331 Outs[0].VT == MVT::i64) {
9332 assert(VA.getLocVT() == MVT::i64 &&
9333 "unexpected calling convention register assignment");
9334 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9335 "unexpected use of 'returned'");
9336 IsThisReturn = true;
9337 }
9338 if (RegsUsed.count(VA.getLocReg())) {
9339 // If this register has already been used then we're trying to pack
9340 // parts of an [N x i32] into an X-register. The extension type will
9341 // take care of putting the two halves in the right place but we have to
9342 // combine them.
9343 SDValue &Bits =
9344 llvm::find_if(RegsToPass,
9345 [=](const std::pair<unsigned, SDValue> &Elt) {
9346 return Elt.first == VA.getLocReg();
9347 })
9348 ->second;
9349 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
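// Illustrative example: when a [2 x i32] is packed into one X register, the
// first part occupies the low 32 bits and the second part (AExtUpper above)
// has already been shifted left by 32, so this OR reassembles the full
// 64-bit register value.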
9350 // Call site info is used for function's parameter entry value
9351 // tracking. For now we track only simple cases when parameter
9352 // is transferred through whole register.
9353 llvm::erase_if(CSInfo.ArgRegPairs,
9354 [&VA](MachineFunction::ArgRegPair ArgReg) {
9355 return ArgReg.Reg == VA.getLocReg();
9356 });
9357 } else {
9358 // Add an extra level of indirection for streaming mode changes by
9359 // using a pseudo copy node that cannot be rematerialised between a
9360 // smstart/smstop and the call by the simple register coalescer.
9361 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9362 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9363 Arg.getValueType(), Arg);
9364 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9365 RegsUsed.insert(VA.getLocReg());
9366 const TargetOptions &Options = DAG.getTarget().Options;
9367 if (Options.EmitCallSiteInfo)
9368 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9369 }
9370 } else {
9371 assert(VA.isMemLoc());
9372
9373 SDValue DstAddr;
9374 MachinePointerInfo DstInfo;
9375
9376 // FIXME: This works on big-endian for composite byvals, which are the
9377 // common case. It should also work for fundamental types.
9378 uint32_t BEAlign = 0;
9379 unsigned OpSize;
9380 if (VA.getLocInfo() == CCValAssign::Indirect ||
9381 VA.getLocInfo() == CCValAssign::Trunc)
9382 OpSize = VA.getLocVT().getFixedSizeInBits();
9383 else
9384 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9385 : VA.getValVT().getSizeInBits();
9386 OpSize = (OpSize + 7) / 8;
9387 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9388 !Flags.isInConsecutiveRegs()) {
9389 if (OpSize < 8)
9390 BEAlign = 8 - OpSize;
9391 }
9392 unsigned LocMemOffset = VA.getLocMemOffset();
9393 int32_t Offset = LocMemOffset + BEAlign;
9394 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9395 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9396
9397 if (IsTailCall) {
9398 Offset = Offset + FPDiff;
9399 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9400
9401 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9402 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9403
9404 // Make sure any stack arguments overlapping with where we're storing
9405 // are loaded before this eventual operation. Otherwise they'll be
9406 // clobbered.
9407 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9408 } else {
9409 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9410
9411 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9412 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9413 }
9414
9415 if (Outs[i].Flags.isByVal()) {
9416 SDValue SizeNode =
9417 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9418 SDValue Cpy = DAG.getMemcpy(
9419 Chain, DL, DstAddr, Arg, SizeNode,
9420 Outs[i].Flags.getNonZeroByValAlign(),
9421 /*isVol = */ false, /*AlwaysInline = */ false,
9422 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9423
9424 MemOpChains.push_back(Cpy);
9425 } else {
9426 // Since i1/i8/i16 are passed on the stack with their natural width and Arg
9427 // has already been promoted to the legal register type i32, truncate Arg
9428 // back to i1/i8/i16.
9429 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9430 VA.getValVT() == MVT::i16)
9431 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9432
9433 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9434 MemOpChains.push_back(Store);
9435 }
9436 }
9437 }
9438
9439 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
9440 SDValue ParamPtr = StackPtr;
9441 if (IsTailCall) {
9442 // Create a dummy object at the top of the stack that can be used to get
9443 // the SP after the epilogue
9444 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9445 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9446 }
9447
9448 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9449 // describing the argument list. x4 contains the address of the
9450 // first stack parameter. x5 contains the size in bytes of all parameters
9451 // passed on the stack.
9452 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9453 RegsToPass.emplace_back(AArch64::X5,
9454 DAG.getConstant(NumBytes, DL, MVT::i64));
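// Illustrative sketch of the resulting convention: for an Arm64EC variadic
// call, x4 = address of the first stack parameter and x5 = NumBytes, the
// 16-byte-aligned size of everything passed on the stack.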
9455 }
9456
9457 if (!MemOpChains.empty())
9458 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9459
9460 SDValue InGlue;
9461 if (RequiresSMChange) {
9462 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9463 Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
9464 DAG.getVTList(MVT::Other, MVT::Glue), Chain);
9465 InGlue = Chain.getValue(1);
9466 }
9467
9468 SDValue NewChain = changeStreamingMode(
9469 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
9470 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9471 Chain = NewChain.getValue(0);
9472 InGlue = NewChain.getValue(1);
9473 }
9474
9475 // Build a sequence of copy-to-reg nodes chained together with token chain
9476 // and flag operands which copy the outgoing args into the appropriate regs.
9477 for (auto &RegToPass : RegsToPass) {
9478 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9479 RegToPass.second, InGlue);
9480 InGlue = Chain.getValue(1);
9481 }
9482
9483 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9484 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9485 // node so that legalize doesn't hack it.
9486 const GlobalValue *CalledGlobal = nullptr;
9487 unsigned OpFlags = 0;
9488 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9489 CalledGlobal = G->getGlobal();
9490 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9491 getTargetMachine());
9492 if (OpFlags & AArch64II::MO_GOT) {
9493 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9494 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9495 } else {
9496 const GlobalValue *GV = G->getGlobal();
9497 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9498 }
9499 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9500 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9501 Subtarget->isTargetMachO()) ||
9503 const char *Sym = S->getSymbol();
9504 if (UseGot) {
9505 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9506 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9507 } else {
9508 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9509 }
9510 }
9511
9512 // We don't usually want to end the call-sequence here because we would tidy
9513 // the frame up *after* the call. However, in the ABI-changing tail-call case
9514 // we've carefully laid out the parameters so that when sp is reset they'll be
9515 // in the correct location.
9516 if (IsTailCall && !IsSibCall) {
9517 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9518 InGlue = Chain.getValue(1);
9519 }
9520
9521 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9522
9523 std::vector<SDValue> Ops;
9524 Ops.push_back(Chain);
9525 Ops.push_back(Callee);
9526
9527 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9528 // be expanded to the call, directly followed by a special marker sequence and
9529 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
9530 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9531 assert(!IsTailCall &&
9532 "tail calls cannot be marked with clang.arc.attachedcall");
9534
9535 // Add a target global address for the retainRV/claimRV runtime function
9536 // just before the call target.
9537 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9538 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9539 Ops.insert(Ops.begin() + 1, GA);
9540 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9541 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9542 } else if (GuardWithBTI) {
9543 Opc = AArch64ISD::CALL_BTI;
9544 }
9545
9546 if (IsTailCall) {
9547 // Each tail call may have to adjust the stack by a different amount, so
9548 // this information must travel along with the operation for eventual
9549 // consumption by emitEpilogue.
9550 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9551 }
9552
9553 if (CLI.PAI) {
9554 const uint64_t Key = CLI.PAI->Key;
9555 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9556 "Invalid auth call key");
9557
9558 // Split the discriminator into address/integer components.
9559 SDValue AddrDisc, IntDisc;
9560 std::tie(IntDisc, AddrDisc) =
9561 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9562
9563 if (Opc == AArch64ISD::CALL_RVMARKER)
9564 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9565 else
9566 Opc = AArch64ISD::AUTH_CALL;
9567 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9568 Ops.push_back(IntDisc);
9569 Ops.push_back(AddrDisc);
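// With these operands, the call is typically emitted as an authenticated
// branch, e.g. (illustrative):
//   blraa x16, x17          ; key IA, x17 = blended discriminator
// or blrab for key IB.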
9570 }
9571
9572 // Add argument registers to the end of the list so that they are known live
9573 // into the call.
9574 for (auto &RegToPass : RegsToPass)
9575 Ops.push_back(DAG.getRegister(RegToPass.first,
9576 RegToPass.second.getValueType()));
9577
9578 // Add a register mask operand representing the call-preserved registers.
9579 const uint32_t *Mask;
9580 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9581 if (IsThisReturn) {
9582 // For 'this' returns, use the X0-preserving mask if applicable
9583 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9584 if (!Mask) {
9585 IsThisReturn = false;
9586 Mask = TRI->getCallPreservedMask(MF, CallConv);
9587 }
9588 } else
9589 Mask = TRI->getCallPreservedMask(MF, CallConv);
9590
9591 if (Subtarget->hasCustomCallingConv())
9592 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9593
9594 if (TRI->isAnyArgRegReserved(MF))
9595 TRI->emitReservedArgRegCallError(MF);
9596
9597 assert(Mask && "Missing call preserved mask for calling convention");
9598 Ops.push_back(DAG.getRegisterMask(Mask));
9599
9600 if (InGlue.getNode())
9601 Ops.push_back(InGlue);
9602
9603 // If we're doing a tail call, use a TC_RETURN here rather than an
9604 // actual call instruction.
9605 if (IsTailCall) {
9606 MF.getFrameInfo().setHasTailCall();
9607 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9608 if (IsCFICall)
9609 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9610
9611 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9612 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9613 if (CalledGlobal &&
9614 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9615 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9616 return Ret;
9617 }
9618
9619 // Returns a chain and a flag for retval copy to use.
9620 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9621 if (IsCFICall)
9622 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9623
9624 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9625 InGlue = Chain.getValue(1);
9626 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9627 if (CalledGlobal &&
9628 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9629 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9630
9631 uint64_t CalleePopBytes =
9632 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9633
9634 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9635 InGlue = Chain.getValue(1);
9636
9637 // Handle result values, copying them out of physregs into vregs that we
9638 // return.
9639 SDValue Result = LowerCallResult(
9640 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9641 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9642
9643 if (!Ins.empty())
9644 InGlue = Result.getValue(Result->getNumValues() - 1);
9645
9646 if (RequiresSMChange) {
9647 assert(PStateSM && "Expected a PStateSM to be set");
9648 Result = changeStreamingMode(
9649 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
9650 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9651
9652 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9653 InGlue = Result.getValue(1);
9654 Result =
9655 DAG.getNode(AArch64ISD::VG_RESTORE, DL,
9656 DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
9657 }
9658 }
9659
9660 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
9661 // Unconditionally resume ZA.
9662 Result = DAG.getNode(
9663 AArch64ISD::SMSTART, DL, MVT::Other, Result,
9664 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9665 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9666
9667 if (ShouldPreserveZT0)
9668 Result =
9669 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
9670 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9671
9672 if (RequiresLazySave) {
9673 // Conditionally restore the lazy save using a pseudo node.
9674 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9675 SDValue RegMask = DAG.getRegisterMask(
9676 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
9677 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9678 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
9679 SDValue TPIDR2_EL0 = DAG.getNode(
9680 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
9681 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
9682
9683 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9684 // RESTORE_ZA pseudo.
9685 SDValue Glue;
9686 SDValue TPIDR2Block = DAG.getFrameIndex(
9687 TPIDR2.FrameIndex,
9688 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9689 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
9690 Result =
9691 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
9692 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
9693 RestoreRoutine, RegMask, Result.getValue(1)});
9694
9695 // Finally reset the TPIDR2_EL0 register to 0.
9696 Result = DAG.getNode(
9697 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
9698 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9699 DAG.getConstant(0, DL, MVT::i64));
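// Taken together, the lazy restore conceptually expands to (illustrative):
//   mrs  x8, TPIDR2_EL0
//   cbnz x8, 1f               ; non-zero: no callee committed the lazy save
//   bl   __arm_tpidr2_restore ; x0 = address of the TPIDR2 block
// 1:
//   msr  TPIDR2_EL0, xzr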
9700 TPIDR2.Uses++;
9701 } else if (RequiresSaveAllZA) {
9702 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
9703 /*IsSave=*/false);
9704 }
9705
9706 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9707 RequiresSaveAllZA) {
9708 for (unsigned I = 0; I < InVals.size(); ++I) {
9709 // The smstart/smstop is chained as part of the call, but when the
9710 // resulting chain is discarded (which happens when the call is not part
9711 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9712 // smstart/smstop is chained to the result value. We can do that by doing
9713 // a vreg -> vreg copy.
9714 Register Reg = MF.getRegInfo().createVirtualRegister(
9715 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
9716 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
9717 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
9718 InVals[I].getValueType());
9719 }
9720 }
9721
9722 if (CallConv == CallingConv::PreserveNone) {
9723 for (const ISD::OutputArg &O : Outs) {
9724 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9725 O.Flags.isSwiftAsync()) {
9726 MF.getFunction().getContext().diagnose(
9727 DiagnosticInfoUnsupported(
9728 MF.getFunction(),
9729 "Swift attributes can't be used with preserve_none",
9730 DL.getDebugLoc()));
9731 break;
9732 }
9733 }
9734 }
9735
9736 return Result;
9737}
9738
9739bool AArch64TargetLowering::CanLowerReturn(
9740 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9741 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
9742 const Type *RetTy) const {
9743 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9744 SmallVector<CCValAssign, 16> RVLocs;
9745 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9746 return CCInfo.CheckReturn(Outs, RetCC);
9747}
9748
9749SDValue
9750AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9751 bool isVarArg,
9752 const SmallVectorImpl<ISD::OutputArg> &Outs,
9753 const SmallVectorImpl<SDValue> &OutVals,
9754 const SDLoc &DL, SelectionDAG &DAG) const {
9755 auto &MF = DAG.getMachineFunction();
9756 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9757
9758 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9759 SmallVector<CCValAssign, 16> RVLocs;
9760 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9761 CCInfo.AnalyzeReturn(Outs, RetCC);
9762
9763 // Copy the result values into the output registers.
9764 SDValue Glue;
9765 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9766 SmallSet<unsigned, 4> RegsUsed;
9767 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9768 ++i, ++realRVLocIdx) {
9769 CCValAssign &VA = RVLocs[i];
9770 assert(VA.isRegLoc() && "Can only return in registers!");
9771 SDValue Arg = OutVals[realRVLocIdx];
9772
9773 switch (VA.getLocInfo()) {
9774 default:
9775 llvm_unreachable("Unknown loc info!");
9776 case CCValAssign::Full:
9777 if (Outs[i].ArgVT == MVT::i1) {
9778 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9779 // value. This is strictly redundant on Darwin (which uses "zeroext
9780 // i1"), but will be optimised out before ISel.
9781 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9782 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9783 }
9784 break;
9785 case CCValAssign::BCvt:
9786 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9787 break;
9788 case CCValAssign::AExt:
9789 case CCValAssign::ZExt:
9790 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9791 break;
9792 case CCValAssign::AExtUpper:
9793 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9794 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9795 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9796 DAG.getConstant(32, DL, VA.getLocVT()));
9797 break;
9798 }
9799
9800 if (RegsUsed.count(VA.getLocReg())) {
9801 SDValue &Bits =
9802 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
9803 return Elt.first == VA.getLocReg();
9804 })->second;
9805 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9806 } else {
9807 RetVals.emplace_back(VA.getLocReg(), Arg);
9808 RegsUsed.insert(VA.getLocReg());
9809 }
9810 }
9811
9812 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9813
9814 // Emit SMSTOP before returning from a locally streaming function
9815 SMEAttrs FuncAttrs(MF.getFunction());
9816 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9817 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9818 Register Reg = FuncInfo->getPStateSMReg();
9819 assert(Reg.isValid() && "PStateSM Register is invalid");
9820 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
9821 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9822 /*Glue*/ SDValue(),
9824 } else
9825 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9826 /*Glue*/ SDValue(), AArch64SME::Always);
9827 Glue = Chain.getValue(1);
9828 }
9829
9830 SmallVector<SDValue, 4> RetOps(1, Chain);
9831 for (auto &RetVal : RetVals) {
9832 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9833 isPassedInFPR(RetVal.second.getValueType()))
9834 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9835 RetVal.second.getValueType(), RetVal.second);
9836 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
9837 Glue = Chain.getValue(1);
9838 RetOps.push_back(
9839 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
9840 }
9841
9842 // Windows AArch64 ABIs require that for returning structs by value we copy
9843 // the sret argument into X0 for the return.
9844 // We saved the argument into a virtual register in the entry block,
9845 // so now we copy the value out and into X0.
9846 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9847 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
9848 getPointerTy(MF.getDataLayout()));
9849
9850 unsigned RetValReg = AArch64::X0;
9851 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9852 RetValReg = AArch64::X8;
9853 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9854 Glue = Chain.getValue(1);
9855
9856 RetOps.push_back(
9857 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9858 }
9859
9860 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9861 if (I) {
9862 for (; *I; ++I) {
9863 if (AArch64::GPR64RegClass.contains(*I))
9864 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9865 else if (AArch64::FPR64RegClass.contains(*I))
9866 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9867 else
9868 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9869 }
9870 }
9871
9872 RetOps[0] = Chain; // Update chain.
9873
9874 // Add the glue if we have it.
9875 if (Glue.getNode())
9876 RetOps.push_back(Glue);
9877
9878 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9879 // ARM64EC entry thunks use a special return sequence: instead of a regular
9880 // "ret" instruction, they need to explicitly call the emulator.
9881 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9882 SDValue Arm64ECRetDest =
9883 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9884 Arm64ECRetDest =
9885 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9886 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9887 MachinePointerInfo());
9888 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9889 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9890 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9891 }
9892
9893 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9894}
9895
9896//===----------------------------------------------------------------------===//
9897// Other Lowering Code
9898//===----------------------------------------------------------------------===//
9899
9900SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9901 SelectionDAG &DAG,
9902 unsigned Flag) const {
9903 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9904 N->getOffset(), Flag);
9905}
9906
9907SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9908 SelectionDAG &DAG,
9909 unsigned Flag) const {
9910 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9911}
9912
9913SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9914 SelectionDAG &DAG,
9915 unsigned Flag) const {
9916 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9917 N->getOffset(), Flag);
9918}
9919
9920SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9921 SelectionDAG &DAG,
9922 unsigned Flag) const {
9923 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9924}
9925
9926SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9927 SelectionDAG &DAG,
9928 unsigned Flag) const {
9929 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9930}
9931
9932// (loadGOT sym)
9933template <class NodeTy>
9934SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9935 unsigned Flags) const {
9936 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9937 SDLoc DL(N);
9938 EVT Ty = getPointerTy(DAG.getDataLayout());
9939 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9940 // FIXME: Once remat is capable of dealing with instructions with register
9941 // operands, expand this into two nodes instead of using a wrapper node.
9942 if (DAG.getMachineFunction()
9943 .getInfo<AArch64FunctionInfo>()
9944 ->hasELFSignedGOT())
9945 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
9946 0);
9947 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9948}
9949
9950// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9951template <class NodeTy>
9952SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9953 unsigned Flags) const {
9954 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9955 SDLoc DL(N);
9956 EVT Ty = getPointerTy(DAG.getDataLayout());
9957 const unsigned char MO_NC = AArch64II::MO_NC;
9958 return DAG.getNode(
9959 AArch64ISD::WrapperLarge, DL, Ty,
9960 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9961 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9962 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9963 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9964}
9965
9966// (addlow (adrp %hi(sym)) %lo(sym))
9967template <class NodeTy>
9968SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9969 unsigned Flags) const {
9970 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9971 SDLoc DL(N);
9972 EVT Ty = getPointerTy(DAG.getDataLayout());
9973 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9974 SDValue Lo = getTargetNode(N, Ty, DAG,
9975 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9976 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
9977 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9978}
9979
9980// (adr sym)
9981template <class NodeTy>
9982SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9983 unsigned Flags) const {
9984 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9985 SDLoc DL(N);
9986 EVT Ty = getPointerTy(DAG.getDataLayout());
9987 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9988 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9989}
9990
9991SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9992 SelectionDAG &DAG) const {
9993 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9994 const GlobalValue *GV = GN->getGlobal();
9995 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9996
9997 if (OpFlags != AArch64II::MO_NO_FLAG)
9998 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9999 "unexpected offset in global node");
10000
10001 // This also catches the large code model case for Darwin, and tiny code
10002 // model with got relocations.
10003 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10004 return getGOT(GN, DAG, OpFlags);
10005 }
10006
10007 SDValue Result;
10008 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10009 !getTargetMachine().isPositionIndependent()) {
10010 Result = getAddrLarge(GN, DAG, OpFlags);
10011 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10012 Result = getAddrTiny(GN, DAG, OpFlags);
10013 } else {
10014 Result = getAddr(GN, DAG, OpFlags);
10015 }
10016 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10017 SDLoc DL(GN);
10018 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10019 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10020 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10021 return Result;
10022}
10023
10024/// Convert a TLS address reference into the correct sequence of loads
10025/// and calls to compute the variable's address (for Darwin, currently) and
10026/// return an SDValue containing the final node.
10027
10028/// Darwin only has one TLS scheme which must be capable of dealing with the
10029/// fully general situation, in the worst case. This means:
10030/// + "extern __thread" declaration.
10031/// + Defined in a possibly unknown dynamic library.
10032///
10033/// The general system is that each __thread variable has a [3 x i64] descriptor
10034/// which contains information used by the runtime to calculate the address. The
10035/// only part of this the compiler needs to know about is the first xword, which
10036/// contains a function pointer that must be called with the address of the
10037/// entire descriptor in "x0".
10038///
10039/// Since this descriptor may be in a different unit, in general even the
10040/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10041/// is:
10042/// adrp x0, _var@TLVPPAGE
10043/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10044/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10045/// ; the function pointer
10046/// blr x1 ; Uses descriptor address in x0
10047/// ; Address of _var is now in x0.
10048///
10049/// If the address of _var's descriptor *is* known to the linker, then it can
10050/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10051/// a slight efficiency gain.
10052SDValue
10053AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10054 SelectionDAG &DAG) const {
10055 assert(Subtarget->isTargetDarwin() &&
10056 "This function expects a Darwin target");
10057
10058 SDLoc DL(Op);
10059 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10060 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10061 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10062
10063 SDValue TLVPAddr =
10064 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10065 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10066
10067 // The first entry in the descriptor is a function pointer that we must call
10068 // to obtain the address of the variable.
10069 SDValue Chain = DAG.getEntryNode();
10070 SDValue FuncTLVGet = DAG.getLoad(
10071 PtrMemVT, DL, Chain, DescAddr,
10072 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10073 Align(PtrMemVT.getSizeInBits() / 8),
10074 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10075 Chain = FuncTLVGet.getValue(1);
10076
10077 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10078 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10079
10081 MFI.setAdjustsStack(true);
10082
10083 // TLS calls preserve all registers except those that absolutely must be
10084 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10085 // silly).
10086 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10087 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10088 if (Subtarget->hasCustomCallingConv())
10089 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10090
10091 // Finally, we can make the call. This is just a degenerate version of a
10092 // normal AArch64 call node: x0 takes the address of the descriptor, and
10093 // returns the address of the variable in this thread.
10094 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10095
10096 unsigned Opcode = AArch64ISD::CALL;
10097 SmallVector<SDValue, 8> Ops;
10098 Ops.push_back(Chain);
10099 Ops.push_back(FuncTLVGet);
10100
10101 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10102 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10103 Opcode = AArch64ISD::AUTH_CALL;
10104 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10105 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10106 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10107 }
10108
10109 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10110 Ops.push_back(DAG.getRegisterMask(Mask));
10111 Ops.push_back(Chain.getValue(1));
10112 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10113 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10114}
10115
10116/// Convert a thread-local variable reference into a sequence of instructions to
10117/// compute the variable's address for the local exec TLS model of ELF targets.
10118/// The sequence depends on the maximum TLS area size.
10119SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10120 SDValue ThreadBase,
10121 const SDLoc &DL,
10122 SelectionDAG &DAG) const {
10123 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10124 SDValue TPOff, Addr;
10125
10126 switch (DAG.getTarget().Options.TLSSize) {
10127 default:
10128 llvm_unreachable("Unexpected TLS size");
10129
10130 case 12: {
10131 // mrs x0, TPIDR_EL0
10132 // add x0, x0, :tprel_lo12:a
10133 SDValue Var = DAG.getTargetGlobalAddress(
10134 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10135 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10136 Var,
10137 DAG.getTargetConstant(0, DL, MVT::i32)),
10138 0);
10139 }
10140
10141 case 24: {
10142 // mrs x0, TPIDR_EL0
10143 // add x0, x0, :tprel_hi12:a
10144 // add x0, x0, :tprel_lo12_nc:a
10145 SDValue HiVar = DAG.getTargetGlobalAddress(
10146 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10147 SDValue LoVar = DAG.getTargetGlobalAddress(
10148 GV, DL, PtrVT, 0,
10149 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10150 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10151 HiVar,
10152 DAG.getTargetConstant(0, DL, MVT::i32)),
10153 0);
10154 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10155 LoVar,
10156 DAG.getTargetConstant(0, DL, MVT::i32)),
10157 0);
10158 }
10159
10160 case 32: {
10161 // mrs x1, TPIDR_EL0
10162 // movz x0, #:tprel_g1:a
10163 // movk x0, #:tprel_g0_nc:a
10164 // add x0, x1, x0
10165 SDValue HiVar = DAG.getTargetGlobalAddress(
10166 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10167 SDValue LoVar = DAG.getTargetGlobalAddress(
10168 GV, DL, PtrVT, 0,
10169 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10170 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10171 DAG.getTargetConstant(16, DL, MVT::i32)),
10172 0);
10173 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10174 DAG.getTargetConstant(0, DL, MVT::i32)),
10175 0);
10176 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10177 }
10178
10179 case 48: {
10180 // mrs x1, TPIDR_EL0
10181 // movz x0, #:tprel_g2:a
10182 // movk x0, #:tprel_g1_nc:a
10183 // movk x0, #:tprel_g0_nc:a
10184 // add x0, x1, x0
10185 SDValue HiVar = DAG.getTargetGlobalAddress(
10186 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10187 SDValue MiVar = DAG.getTargetGlobalAddress(
10188 GV, DL, PtrVT, 0,
10189 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10190 SDValue LoVar = DAG.getTargetGlobalAddress(
10191 GV, DL, PtrVT, 0,
10192 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10193 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10194 DAG.getTargetConstant(32, DL, MVT::i32)),
10195 0);
10196 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10197 DAG.getTargetConstant(16, DL, MVT::i32)),
10198 0);
10199 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10200 DAG.getTargetConstant(0, DL, MVT::i32)),
10201 0);
10202 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10203 }
10204 }
10205}
10206
10207/// When accessing thread-local variables under either the general-dynamic or
10208/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10209/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10210/// is a function pointer to carry out the resolution.
10211///
10212/// The sequence is:
10213/// adrp x0, :tlsdesc:var
10214/// ldr x1, [x0, #:tlsdesc_lo12:var]
10215/// add x0, x0, #:tlsdesc_lo12:var
10216/// .tlsdesccall var
10217/// blr x1
10218/// (TPIDR_EL0 offset now in x0)
10219///
10220/// The above sequence must be produced unscheduled, to enable the linker to
10221/// optimize/relax this sequence.
10222/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10223/// above sequence, and expanded really late in the compilation flow, to ensure
10224/// the sequence is produced as per above.
10225SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10226 const SDLoc &DL,
10227 SelectionDAG &DAG) const {
10228 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10229
10230 SDValue Chain = DAG.getEntryNode();
10231 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10232
10233 unsigned Opcode =
10234 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10235 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10236 : AArch64ISD::TLSDESC_CALLSEQ;
10237 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10238 SDValue Glue = Chain.getValue(1);
10239
10240 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10241}
10242
10243SDValue
10244AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10245 SelectionDAG &DAG) const {
10246 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10247
10248 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10249 AArch64FunctionInfo *MFI =
10250 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10251
10252 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
10255
10256 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10257 if (Model == TLSModel::LocalDynamic)
10258 Model = TLSModel::GeneralDynamic;
10259 }
10260
10262 Model != TLSModel::LocalExec)
10263 report_fatal_error("ELF TLS only supported in small memory model or "
10264 "in local exec TLS model");
10265 // Different choices can be made for the maximum size of the TLS area for a
10266 // module. For the small address model, the default TLS size is 16MiB and the
10267 // maximum TLS size is 4GiB.
10268 // FIXME: add tiny and large code model support for TLS access models other
10269 // than local exec. We currently generate the same code as small for tiny,
10270 // which may be larger than needed.
10271
10272 SDValue TPOff;
10273 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10274 SDLoc DL(Op);
10275 const GlobalValue *GV = GA->getGlobal();
10276
10277 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10278
10279 if (Model == TLSModel::LocalExec) {
10280 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10281 } else if (Model == TLSModel::InitialExec) {
10282 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10283 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10284 } else if (Model == TLSModel::LocalDynamic) {
10285 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10286 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10287 // the beginning of the module's TLS region, followed by a DTPREL offset
10288 // calculation.
10289
10290 // These accesses will need deduplicating if there's more than one.
10291 MFI->incNumLocalDynamicTLSAccesses();
10292
10293 // The call needs a relocation too for linker relaxation. It doesn't make
10294 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10295 // the address.
10296 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10297 AArch64II::MO_TLS);
10298
10299 // Now we can calculate the offset from TPIDR_EL0 to this module's
10300 // thread-local area.
10301 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10302
10303 // Now use :dtprel_whatever: operations to calculate this variable's offset
10304 // in its thread-storage area.
10305 SDValue HiVar = DAG.getTargetGlobalAddress(
10306 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10307 SDValue LoVar = DAG.getTargetGlobalAddress(
10308 GV, DL, MVT::i64, 0,
10309 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10310
10311 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10312 DAG.getTargetConstant(0, DL, MVT::i32)),
10313 0);
10314 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10315 DAG.getTargetConstant(0, DL, MVT::i32)),
10316 0);
10317 } else if (Model == TLSModel::GeneralDynamic) {
10318 // The call needs a relocation too for linker relaxation. It doesn't make
10319 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10320 // the address.
10321 SDValue SymAddr =
10322 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10323
10324 // Finally we can make a call to calculate the offset from tpidr_el0.
10325 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10326 } else
10327 llvm_unreachable("Unsupported ELF TLS access model");
10328
10329 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10330}
10331
10332SDValue
10333AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10334 SelectionDAG &DAG) const {
10335 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10336
10337 SDValue Chain = DAG.getEntryNode();
10338 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10339 SDLoc DL(Op);
10340
10341 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10342
10343 // Load the ThreadLocalStoragePointer from the TEB
10344 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10345 SDValue TLSArray =
10346 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10347 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10348 Chain = TLSArray.getValue(1);
10349
10350 // Load the TLS index from the C runtime.
10351 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10352 // This also does the same as LOADgot, but using a generic i32 load,
10353 // while LOADgot only loads i64.
10354 SDValue TLSIndexHi =
10355 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10356 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10357 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10358 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10359 SDValue TLSIndex =
10360 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10361 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10362 Chain = TLSIndex.getValue(1);
10363
10364 // The pointer to the thread's TLS data area is at offset TLSIndex * 8 into
10365 // the TLS array.
10366 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10367 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10368 DAG.getConstant(3, DL, PtrVT));
10369 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10370 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10371 MachinePointerInfo());
10372 Chain = TLS.getValue(1);
10373
10374 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10375 const GlobalValue *GV = GA->getGlobal();
10376 SDValue TGAHi = DAG.getTargetGlobalAddress(
10377 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10378 SDValue TGALo = DAG.getTargetGlobalAddress(
10379 GV, DL, PtrVT, 0,
10380 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10381
10382 // Add the offset from the start of the .tls section (section base).
10383 SDValue Addr =
10384 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10385 DAG.getTargetConstant(0, DL, MVT::i32)),
10386 0);
10387 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10388 return Addr;
10389}
10390
10391SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10392 SelectionDAG &DAG) const {
10393 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10394 if (DAG.getTarget().useEmulatedTLS())
10395 return LowerToTLSEmulatedModel(GA, DAG);
10396
10397 if (Subtarget->isTargetDarwin())
10398 return LowerDarwinGlobalTLSAddress(Op, DAG);
10399 if (Subtarget->isTargetELF())
10400 return LowerELFGlobalTLSAddress(Op, DAG);
10401 if (Subtarget->isTargetWindows())
10402 return LowerWindowsGlobalTLSAddress(Op, DAG);
10403
10404 llvm_unreachable("Unexpected platform trying to use TLS");
10405}
10406
10407//===----------------------------------------------------------------------===//
10408// PtrAuthGlobalAddress lowering
10409//
10410// We have 3 lowering alternatives to choose from:
10411// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10412// If the GV doesn't need a GOT load (i.e., is locally defined)
10413// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10414//
10415// - LOADgotPAC: similar to LOADgot, with added PAC.
10416// If the GV needs a GOT load, materialize the pointer using the usual
10417// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10418// section is assumed to be read-only (for example, via relro mechanism). See
10419// LowerMOVaddrPAC.
10420//
10421// - LOADauthptrstatic: similar to LOADgot, but use a
10422// special stub slot instead of a GOT slot.
10423// Load a signed pointer for symbol 'sym' from a stub slot named
10424// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10425// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10426// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10427//
10428 // All 3 are pseudos that are expanded late to longer sequences: this lets us
10429// provide integrity guarantees on the to-be-signed intermediate values.
10430//
10431// LOADauthptrstatic is undesirable because it requires a large section filled
10432// with often similarly-signed pointers, making it a good harvesting target.
10433// Thus, it's only used for ptrauth references to extern_weak to avoid null
10434// checks.
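// Rough example (illustrative): taking the address of a locally defined
// global through a ptrauth constant such as
//   ptr ptrauth (ptr @g, i32 0, i64 1234)
// goes down the MOVaddrPAC path, i.e. adrp+add to materialize @g followed by
// a PAC with key IA and constant discriminator 1234.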
10435
10437 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10438 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10439 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10440 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10441
10442 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10443 // offset alone as a pointer if the symbol wasn't available, which would
10444 // probably break null checks in users. Ptrauth complicates things further:
10445 // error out.
10446 if (TGN->getOffset() != 0)
10447 report_fatal_error(
10448 "unsupported non-zero offset in weak ptrauth global reference");
10449
10450 if (!isNullConstant(AddrDiscriminator))
10451 report_fatal_error("unsupported weak addr-div ptrauth global");
10452
10453 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10454 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10455 {TGA, Key, Discriminator}),
10456 0);
10457}
10458
10459SDValue
10460AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10461 SelectionDAG &DAG) const {
10462 SDValue Ptr = Op.getOperand(0);
10463 uint64_t KeyC = Op.getConstantOperandVal(1);
10464 SDValue AddrDiscriminator = Op.getOperand(2);
10465 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10466 EVT VT = Op.getValueType();
10467 SDLoc DL(Op);
10468
10469 if (KeyC > AArch64PACKey::LAST)
10470 report_fatal_error("key in ptrauth global out of range [0, " +
10471 Twine((int)AArch64PACKey::LAST) + "]");
10472
10473 // Blend only works if the integer discriminator is 16-bit wide.
10474 if (!isUInt<16>(DiscriminatorC))
10475 report_fatal_error(
10476 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10477
10478 // Choosing between 3 lowering alternatives is target-specific.
10479 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10480 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10481
10482 int64_t PtrOffsetC = 0;
10483 if (Ptr.getOpcode() == ISD::ADD) {
10484 PtrOffsetC = Ptr.getConstantOperandVal(1);
10485 Ptr = Ptr.getOperand(0);
10486 }
10487 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10488 const GlobalValue *PtrGV = PtrN->getGlobal();
10489
10490 // Classify the reference to determine whether it needs a GOT load.
10491 const unsigned OpFlags =
10492 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10493 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10494 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10495 "unsupported non-GOT op flags on ptrauth global reference");
10496
10497 // Fold any offset into the GV; our pseudos expect it there.
10498 PtrOffsetC += PtrN->getOffset();
10499 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10500 /*TargetFlags=*/0);
10501 assert(PtrN->getTargetFlags() == 0 &&
10502 "unsupported target flags on ptrauth global");
10503
10504 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10505 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10506 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10507 ? AddrDiscriminator
10508 : DAG.getRegister(AArch64::XZR, MVT::i64);
10509
10510 // No GOT load needed -> MOVaddrPAC
10511 if (!NeedsGOTLoad) {
10512 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10513 return SDValue(
10514 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10515 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10516 0);
10517 }
10518
10519 // GOT load -> LOADgotPAC
10520 // Note that we disallow extern_weak refs to avoid null checks later.
10521 if (!PtrGV->hasExternalWeakLinkage())
10522 return SDValue(
10523 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10524 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10525 0);
10526
10527 // extern_weak ref -> LOADauthptrstatic
10528 return LowerPtrAuthGlobalAddressStatically(
10529 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10530 DAG);
10531}
10532
10533// Looks through \param Val to determine the bit that can be used to
10534// check the sign of the value. It returns the unextended value and
10535// the sign bit position.
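// For example (illustrative): for (sign_extend_inreg x, i8) this returns
// {x, 7}, i.e. bit 7 of the unextended value is the sign bit to test.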
10536std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10537 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10538 return {Val.getOperand(0),
10539 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10540 1};
10541
10542 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10543 return {Val.getOperand(0),
10544 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10545
10546 return {Val, Val.getValueSizeInBits() - 1};
10547}
10548
10549SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10550 SDValue Chain = Op.getOperand(0);
10551 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10552 SDValue LHS = Op.getOperand(2);
10553 SDValue RHS = Op.getOperand(3);
10554 SDValue Dest = Op.getOperand(4);
10555 SDLoc dl(Op);
10556
10558 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10559 // will not be produced, as they are conditional branch instructions that do
10560 // not set flags.
10561 bool ProduceNonFlagSettingCondBr =
10562 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10563
10564 // Handle f128 first, since lowering it will result in comparing the return
10565 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10566 // is expecting to deal with.
10567 if (LHS.getValueType() == MVT::f128) {
10568 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10569
10570 // If softenSetCCOperands returned a scalar, we need to compare the result
10571 // against zero to select between true and false values.
10572 if (!RHS.getNode()) {
10573 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10574 CC = ISD::SETNE;
10575 }
10576 }
10577
10578 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10579 // instruction.
10580 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10581 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10582 // Only lower legal XALUO ops.
10583 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10584 return SDValue();
10585
10586 // The actual operation with overflow check.
10587 AArch64CC::CondCode OFCC;
10588 SDValue Value, Overflow;
10589 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10590
10591 if (CC == ISD::SETNE)
10592 OFCC = getInvertedCondCode(OFCC);
10593 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
10594
10595 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10596 Overflow);
10597 }
10598
10599 if (LHS.getValueType().isInteger()) {
10600 assert((LHS.getValueType() == RHS.getValueType()) &&
10601 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10602
10603 // If the RHS of the comparison is zero, we can potentially fold this
10604 // to a specialized branch.
10605 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10606 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10607 if (CC == ISD::SETEQ) {
10608 // See if we can use a TBZ to fold in an AND as well.
10609 // TBZ has a smaller branch displacement than CBZ. If the offset is
10610 // out of bounds, a late MI-layer pass rewrites branches.
10611 // 403.gcc is an example that hits this case.
10612 if (LHS.getOpcode() == ISD::AND &&
10613 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10614 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10615 SDValue Test = LHS.getOperand(0);
10616 uint64_t Mask = LHS.getConstantOperandVal(1);
10617 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
10618 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10619 Dest);
10620 }
10621
10622 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
10623 } else if (CC == ISD::SETNE) {
10624 // See if we can use a TBZ to fold in an AND as well.
10625 // TBZ has a smaller branch displacement than CBZ. If the offset is
10626 // out of bounds, a late MI-layer pass rewrites branches.
10627 // 403.gcc is an example that hits this case.
10628 if (LHS.getOpcode() == ISD::AND &&
10629 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10630 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10631 SDValue Test = LHS.getOperand(0);
10632 uint64_t Mask = LHS.getConstantOperandVal(1);
10633 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
10634 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10635 Dest);
10636 }
10637
10638 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
10639 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10640 // Don't combine AND since emitComparison converts the AND to an ANDS
10641 // (a.k.a. TST) and the test in the test bit and branch instruction
10642 // becomes redundant. This would also increase register pressure.
10643 uint64_t SignBitPos;
10644 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10645 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
10646 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10647 }
10648 }
10649 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10650 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10651 // Don't combine AND since emitComparison converts the AND to an ANDS
10652 // (a.k.a. TST) and the test in the test bit and branch instruction
10653 // becomes redundant. This would also increase register pressure.
10654 uint64_t SignBitPos;
10655 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10656 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
10657 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10658 }
10659
10660 SDValue CCVal;
10661 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10662 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10663 Cmp);
10664 }
10665
10666 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10667 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10668
10669 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10670 // clean. Some of them require two branches to implement.
10671 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10672 AArch64CC::CondCode CC1, CC2;
10673 changeFPCCToAArch64CC(CC, CC1, CC2);
10674 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10675 SDValue BR1 =
10676 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
10677 if (CC2 != AArch64CC::AL) {
10678 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10679 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
10680 Cmp);
10681 }
10682
10683 return BR1;
10684}
10685
10686SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10687 SelectionDAG &DAG) const {
10688 if (!Subtarget->isNeonAvailable() &&
10689 !Subtarget->useSVEForFixedLengthVectors())
10690 return SDValue();
10691
10692 EVT VT = Op.getValueType();
10693 EVT IntVT = VT.changeTypeToInteger();
10694 SDLoc DL(Op);
10695
10696 SDValue In1 = Op.getOperand(0);
10697 SDValue In2 = Op.getOperand(1);
10698 EVT SrcVT = In2.getValueType();
10699
10700 if (!SrcVT.bitsEq(VT))
10701 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
10702
10703 if (VT.isScalableVector())
10704 IntVT =
10705 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
10706
10707 if (VT.isFixedLengthVector() &&
10708 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
10709 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10710
10711 In1 = convertToScalableVector(DAG, ContainerVT, In1);
10712 In2 = convertToScalableVector(DAG, ContainerVT, In2);
10713
10714 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
10715 return convertFromScalableVector(DAG, VT, Res);
10716 }
10717
10718 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10719 if (VT.isScalableVector())
10720 return getSVESafeBitCast(VT, Op, DAG);
10721
10722 return DAG.getBitcast(VT, Op);
10723 };
10724
10725 SDValue VecVal1, VecVal2;
10726 EVT VecVT;
10727 auto SetVecVal = [&](int Idx = -1) {
10728 if (!VT.isVector()) {
10729 VecVal1 =
10730 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
10731 VecVal2 =
10732 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
10733 } else {
10734 VecVal1 = BitCast(VecVT, In1, DAG);
10735 VecVal2 = BitCast(VecVT, In2, DAG);
10736 }
10737 };
10738 if (VT.isVector()) {
10739 VecVT = IntVT;
10740 SetVecVal();
10741 } else if (VT == MVT::f64) {
10742 VecVT = MVT::v2i64;
10743 SetVecVal(AArch64::dsub);
10744 } else if (VT == MVT::f32) {
10745 VecVT = MVT::v4i32;
10746 SetVecVal(AArch64::ssub);
10747 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10748 VecVT = MVT::v8i16;
10749 SetVecVal(AArch64::hsub);
10750 } else {
10751 llvm_unreachable("Invalid type for copysign!");
10752 }
10753
10754 unsigned BitWidth = In1.getScalarValueSizeInBits();
10755 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10756
10757 // We want to materialize a mask with every bit but the high bit set, but the
10758 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10759 // 64-bit elements. Instead, materialize all bits set and then negate that.
10760 if (VT == MVT::f64 || VT == MVT::v2f64) {
10761 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10762 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10763 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10764 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10765 }
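// The BSP node below is a bitwise select: it takes bits from VecVal1 where the
// mask is set and from VecVal2 where it is clear. With the all-bits-but-sign
// mask this keeps the magnitude of In1 and the sign of In2, i.e. copysign.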
10766
10767 SDValue BSP =
10768 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
10769 if (VT == MVT::f16 || VT == MVT::bf16)
10770 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
10771 if (VT == MVT::f32)
10772 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
10773 if (VT == MVT::f64)
10774 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
10775
10776 return BitCast(VT, BSP, DAG);
10777}
10778
10779SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10780 SelectionDAG &DAG) const {
10781 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10782 Attribute::NoImplicitFloat))
10783 return SDValue();
10784
10785 EVT VT = Op.getValueType();
10786 if (VT.isScalableVector() ||
10787 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
10788 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
10789
10790 if (!Subtarget->isNeonAvailable())
10791 return SDValue();
10792
10793 bool IsParity = Op.getOpcode() == ISD::PARITY;
10794 SDValue Val = Op.getOperand(0);
10795 SDLoc DL(Op);
10796
10797 // for i32, general parity function using EORs is more efficient compared to
10798 // using floating point
10799 if (VT == MVT::i32 && IsParity)
10800 return SDValue();
10801
10802 // If there is no CNT instruction available, GPR popcount can
10803 // be more efficiently lowered to the following sequence that uses
10804 // AdvSIMD registers/instructions as long as the copies to/from
10805 // the AdvSIMD registers are cheap.
10806 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10807 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10808 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10809 // FMOV X0, D0 // copy result back to integer reg
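// For ISD::PARITY the same CNT+ADDV sequence is reused and only the low bit of
// the byte-count sum is kept below, since parity(x) == popcount(x) & 1.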
10810 if (VT == MVT::i32 || VT == MVT::i64) {
10811 if (VT == MVT::i32)
10812 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
10813 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
10814
10815 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10816 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10817 if (VT == MVT::i32)
10818 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
10819 DAG.getConstant(0, DL, MVT::i64));
10820 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10821 if (IsParity)
10822 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10823 return AddV;
10824 } else if (VT == MVT::i128) {
10825 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
10826
10827 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10828 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10829 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10830 if (IsParity)
10831 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10832 return AddV;
10833 }
10834
10835 assert(!IsParity && "ISD::PARITY of vector types not supported");
10836
10837 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10838 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10839 "Unexpected type for custom ctpop lowering");
10840
10841 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10842 Val = DAG.getBitcast(VT8Bit, Val);
10843 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
10844
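// With the dot product extension the per-byte counts can be accumulated into
// wider lanes with a single UDOT against an all-ones vector; roughly, for
// v4i32 (registers illustrative):
//   cnt  v0.16b, v0.16b        // per-byte popcounts
//   movi v1.16b, #1
//   movi v2.4s, #0
//   udot v2.4s, v0.16b, v1.16b // each i32 lane = sum of its 4 byte counts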
10845 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10846 VT.getVectorNumElements() >= 2) {
10847 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10848 SDValue Zeros = DAG.getConstant(0, DL, DT);
10849 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
10850
10851 if (VT == MVT::v2i64) {
10852 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10853 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
10854 } else if (VT == MVT::v2i32) {
10855 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10856 } else if (VT == MVT::v4i32) {
10857 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10858 } else {
10859 llvm_unreachable("Unexpected type for custom ctpop lowering");
10860 }
10861
10862 return Val;
10863 }
10864
10865 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
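// e.g. widening v16i8 byte counts up to v4i32 takes two pairwise steps:
//   uaddlp v0.8h, v0.16b
//   uaddlp v0.4s, v0.8h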
10866 unsigned EltSize = 8;
10867 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10868 while (EltSize != VT.getScalarSizeInBits()) {
10869 EltSize *= 2;
10870 NumElts /= 2;
10871 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
10872 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
10873 }
10874
10875 return Val;
10876}
10877
10878SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10879 EVT VT = Op.getValueType();
10880 assert(VT.isScalableVector() ||
10881 useSVEForFixedLengthVectorVT(
10882 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10883
10884 SDLoc DL(Op);
10885 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
10886 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
10887}
10888
10889SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10890 SelectionDAG &DAG) const {
10891
10892 EVT VT = Op.getValueType();
10893 SDLoc DL(Op);
10894 unsigned Opcode = Op.getOpcode();
10895 ISD::CondCode CC;
10896 switch (Opcode) {
10897 default:
10898 llvm_unreachable("Wrong instruction");
10899 case ISD::SMAX:
10900 CC = ISD::SETGT;
10901 break;
10902 case ISD::SMIN:
10903 CC = ISD::SETLT;
10904 break;
10905 case ISD::UMAX:
10906 CC = ISD::SETUGT;
10907 break;
10908 case ISD::UMIN:
10909 CC = ISD::SETULT;
10910 break;
10911 }
10912
10913 if (VT.isScalableVector() ||
10914 useSVEForFixedLengthVectorVT(
10915 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10916 switch (Opcode) {
10917 default:
10918 llvm_unreachable("Wrong instruction");
10919 case ISD::SMAX:
10920 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10921 case ISD::SMIN:
10922 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10923 case ISD::UMAX:
10924 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10925 case ISD::UMIN:
10926 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10927 }
10928 }
10929
10930 SDValue Op0 = Op.getOperand(0);
10931 SDValue Op1 = Op.getOperand(1);
10932 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10933 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10934}
10935
10936SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10937 SelectionDAG &DAG) const {
10938 EVT VT = Op.getValueType();
10939
10940 if (VT.isScalableVector() ||
10941 useSVEForFixedLengthVectorVT(
10942 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10943 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10944
10945 SDLoc DL(Op);
10946 SDValue REVB;
10947 MVT VST;
10948
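// The element-wise bit reversal is decomposed into a byte reversal within each
// element (the REV32/REV64 below) followed by a bit reversal within each byte
// (the v8i8/v16i8 ISD::BITREVERSE, which selects to RBIT on the byte vector).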
10949 switch (VT.getSimpleVT().SimpleTy) {
10950 default:
10951 llvm_unreachable("Invalid type for bitreverse!");
10952
10953 case MVT::v2i32: {
10954 VST = MVT::v8i8;
10955 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10956
10957 break;
10958 }
10959
10960 case MVT::v4i32: {
10961 VST = MVT::v16i8;
10962 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10963
10964 break;
10965 }
10966
10967 case MVT::v1i64: {
10968 VST = MVT::v8i8;
10969 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10970
10971 break;
10972 }
10973
10974 case MVT::v2i64: {
10975 VST = MVT::v16i8;
10976 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10977
10978 break;
10979 }
10980 }
10981
10982 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10983 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10984}
10985
10986 // Check whether this is a continuous comparison sequence, i.e. a chain of ORs
10987 // whose leaves are XOR comparisons.
10987static bool
10988isOrXorChain(SDValue N, unsigned &Num,
10989 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10990 if (Num == MaxXors)
10991 return false;
10992
10993 // Skip the one-use zext
10994 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10995 N = N->getOperand(0);
10996
10997 // The leaf node must be XOR
10998 if (N->getOpcode() == ISD::XOR) {
10999 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11000 Num++;
11001 return true;
11002 }
11003
11004 // All the non-leaf nodes must be OR.
11005 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11006 return false;
11007
11008 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11009 isOrXorChain(N->getOperand(1), Num, WorkList))
11010 return true;
11011 return false;
11012}
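// As an illustration, the chain matched above is what memcmp/bcmp-style
// equality expansions produce, e.g. for a 16-byte compare:
//   (seteq (or (xor a0, b0), (xor a1, b1)), 0)
// which is ultimately emitted as a cmp/ccmp chain, roughly:
//   cmp  x0, x2
//   ccmp x1, x3, #0, eq
//   cset w0, eq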
11013
11014 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
11015 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11016 SDValue LHS = N->getOperand(0);
11017 SDValue RHS = N->getOperand(1);
11018 SDLoc DL(N);
11019 EVT VT = N->getValueType(0);
11020 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11021
11022 // Only handle integer compares.
11023 if (N->getOpcode() != ISD::SETCC)
11024 return SDValue();
11025
11026 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11027 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11028 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11029 unsigned NumXors = 0;
11030 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11031 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11032 isOrXorChain(LHS, NumXors, WorkList)) {
11033 SDValue XOR0, XOR1;
11034 std::tie(XOR0, XOR1) = WorkList[0];
11035 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11036 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11037 for (unsigned I = 1; I < WorkList.size(); I++) {
11038 std::tie(XOR0, XOR1) = WorkList[I];
11039 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11040 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11041 }
11042
11043 // Exit early by inverting the condition, which helps reduce indentation.
11044 return Cmp;
11045 }
11046
11047 return SDValue();
11048}
11049
11050SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11051
11052 if (Op.getValueType().isVector())
11053 return LowerVSETCC(Op, DAG);
11054
11055 bool IsStrict = Op->isStrictFPOpcode();
11056 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11057 unsigned OpNo = IsStrict ? 1 : 0;
11058 SDValue Chain;
11059 if (IsStrict)
11060 Chain = Op.getOperand(0);
11061 SDValue LHS = Op.getOperand(OpNo + 0);
11062 SDValue RHS = Op.getOperand(OpNo + 1);
11063 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11064 SDLoc dl(Op);
11065
11066 // We chose ZeroOrOneBooleanContents, so use zero and one.
11067 EVT VT = Op.getValueType();
11068 SDValue TVal = DAG.getConstant(1, dl, VT);
11069 SDValue FVal = DAG.getConstant(0, dl, VT);
11070
11071 // Handle f128 first, since one possible outcome is a normal integer
11072 // comparison which gets picked up by the next if statement.
11073 if (LHS.getValueType() == MVT::f128) {
11074 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
11075 IsSignaling);
11076
11077 // If softenSetCCOperands returned a scalar, use it.
11078 if (!RHS.getNode()) {
11079 assert(LHS.getValueType() == Op.getValueType() &&
11080 "Unexpected setcc expansion!");
11081 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
11082 }
11083 }
11084
11085 if (LHS.getValueType().isInteger()) {
11086
11087 simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
11088
11089 SDValue CCVal;
11090 SDValue Cmp = getAArch64Cmp(
11091 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
11092
11093 // Note that we inverted the condition above, so we reverse the order of
11094 // the true and false operands here. This will allow the setcc to be
11095 // matched to a single CSINC instruction.
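// e.g. (i32 seteq %a, %b) ends up as a compare plus the inverted-operand CSEL
// below, which the selector matches to a CSINC of wzr (registers illustrative):
//   cmp  w0, w1
//   cset w0, eq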
11096 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
11097 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
11098 }
11099
11100 // Now we know we're dealing with FP values.
11101 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11102 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11103
11104 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11105 // and do the comparison.
11106 SDValue Cmp;
11107 if (IsStrict)
11108 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
11109 else
11110 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11111
11112 AArch64CC::CondCode CC1, CC2;
11113 changeFPCCToAArch64CC(CC, CC1, CC2);
11114 SDValue Res;
11115 if (CC2 == AArch64CC::AL) {
11116 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11117 CC2);
11118 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11119
11120 // Note that we inverted the condition above, so we reverse the order of
11121 // the true and false operands here. This will allow the setcc to be
11122 // matched to a single CSINC instruction.
11123 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
11124 } else {
11125 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11126 // totally clean. Some of them require two CSELs to implement. As is in
11127 // this case, we emit the first CSEL and then emit a second using the output
11128 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11129
11130 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11131 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11132 SDValue CS1 =
11133 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11134
11135 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11136 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11137 }
11138 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
11139}
11140
11141SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11142 SelectionDAG &DAG) const {
11143
11144 SDValue LHS = Op.getOperand(0);
11145 SDValue RHS = Op.getOperand(1);
11146 EVT VT = LHS.getValueType();
11147 if (VT != MVT::i32 && VT != MVT::i64)
11148 return SDValue();
11149
11150 SDLoc DL(Op);
11151 SDValue Carry = Op.getOperand(2);
11152 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11153 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11154 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
11155 LHS, RHS, InvCarry);
11156
11157 EVT OpVT = Op.getValueType();
11158 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11159 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11160
11161 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11162 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11163 SDValue CCVal =
11164 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
11165 // Inputs are swapped because the condition is inverted. This will allow
11166 // matching with a single CSINC instruction.
11167 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11168 Cmp.getValue(1));
11169}
11170
11171SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
11172 SDValue RHS, SDValue TVal,
11173 SDValue FVal, const SDLoc &dl,
11174 SelectionDAG &DAG) const {
11175 // Handle f128 first, because it will result in a comparison of some RTLIB
11176 // call result against zero.
11177 if (LHS.getValueType() == MVT::f128) {
11178 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
11179
11180 // If softenSetCCOperands returned a scalar, we need to compare the result
11181 // against zero to select between true and false values.
11182 if (!RHS.getNode()) {
11183 RHS = DAG.getConstant(0, dl, LHS.getValueType());
11184 CC = ISD::SETNE;
11185 }
11186 }
11187
11188 // Also handle f16, for which we need to do a f32 comparison.
11189 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11190 LHS.getValueType() == MVT::bf16) {
11191 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
11192 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
11193 }
11194
11195 // Next, handle integers.
11196 if (LHS.getValueType().isInteger()) {
11197 assert((LHS.getValueType() == RHS.getValueType()) &&
11198 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11199
11200 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11201 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11202 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11203 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
11204 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
11205 // supported types.
11206 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
11207 CTVal->isOne() && CFVal->isAllOnes() &&
11208 LHS.getValueType() == TVal.getValueType()) {
11209 EVT VT = LHS.getValueType();
11210 SDValue Shift =
11211 DAG.getNode(ISD::SRA, dl, VT, LHS,
11212 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11213 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
11214 }
11215
11216 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11217 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11218 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11219 // Both require fewer instructions than compare and conditional select.
11220 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11221 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11222 LHS.getValueType() == RHS.getValueType()) {
11223 EVT VT = LHS.getValueType();
11224 SDValue Shift =
11225 DAG.getNode(ISD::SRA, dl, VT, LHS,
11226 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11227
11228 if (CC == ISD::SETGT)
11229 Shift = DAG.getNOT(dl, Shift, VT);
11230
11231 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
11232 }
11233
11234 unsigned Opcode = AArch64ISD::CSEL;
11235
11236 // If both the TVal and the FVal are constants, see if we can swap them in
11237 // order to form a CSINV or CSINC out of them.
11238 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11239 std::swap(TVal, FVal);
11240 std::swap(CTVal, CFVal);
11241 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11242 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11243 std::swap(TVal, FVal);
11244 std::swap(CTVal, CFVal);
11245 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11246 } else if (TVal.getOpcode() == ISD::XOR) {
11247 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11248 // with a CSINV rather than a CSEL.
11249 if (isAllOnesConstant(TVal.getOperand(1))) {
11250 std::swap(TVal, FVal);
11251 std::swap(CTVal, CFVal);
11252 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11253 }
11254 } else if (TVal.getOpcode() == ISD::SUB) {
11255 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11256 // that we can match with a CSNEG rather than a CSEL.
11257 if (isNullConstant(TVal.getOperand(0))) {
11258 std::swap(TVal, FVal);
11259 std::swap(CTVal, CFVal);
11260 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11261 }
11262 } else if (CTVal && CFVal) {
11263 const int64_t TrueVal = CTVal->getSExtValue();
11264 const int64_t FalseVal = CFVal->getSExtValue();
11265 bool Swap = false;
11266
11267 // If both TVal and FVal are constants, see if FVal is the
11268 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11269 // instead of a CSEL in that case.
11270 if (TrueVal == ~FalseVal) {
11271 Opcode = AArch64ISD::CSINV;
11272 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11273 TrueVal == -FalseVal) {
11274 Opcode = AArch64ISD::CSNEG;
11275 } else if (TVal.getValueType() == MVT::i32) {
11276 // If our operands are only 32-bit wide, make sure we use 32-bit
11277 // arithmetic for the check whether we can use CSINC. This ensures that
11278 // the addition in the check will wrap around properly in case there is
11279 // an overflow (which would not be the case if we do the check with
11280 // 64-bit arithmetic).
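// e.g. with i32 TVal = 0 and FVal = -1: FalseVal32 + 1 wraps to 0 == TrueVal32,
// so CSINC applies, whereas a 64-bit check (0xffffffff + 1 == 0x100000000)
// would miss it.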
11281 const uint32_t TrueVal32 = CTVal->getZExtValue();
11282 const uint32_t FalseVal32 = CFVal->getZExtValue();
11283
11284 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11285 Opcode = AArch64ISD::CSINC;
11286
11287 if (TrueVal32 > FalseVal32) {
11288 Swap = true;
11289 }
11290 }
11291 } else {
11292 // 64-bit check whether we can use CSINC.
11293 const uint64_t TrueVal64 = TrueVal;
11294 const uint64_t FalseVal64 = FalseVal;
11295
11296 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11297 Opcode = AArch64ISD::CSINC;
11298
11299 if (TrueVal > FalseVal) {
11300 Swap = true;
11301 }
11302 }
11303 }
11304
11305 // Swap TVal and FVal if necessary.
11306 if (Swap) {
11307 std::swap(TVal, FVal);
11308 std::swap(CTVal, CFVal);
11309 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11310 }
11311
11312 if (Opcode != AArch64ISD::CSEL) {
11313 // Drop FVal since we can get its value by simply inverting/negating
11314 // TVal.
11315 FVal = TVal;
11316 }
11317 }
11318
11319 // Avoid materializing a constant when possible by reusing a known value in
11320 // a register. However, don't perform this optimization if the known value
11321 // is one, zero or negative one in the case of a CSEL. We can always
11322 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11323 // FVal, respectively.
11324 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11325 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11326 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11327 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11328 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11329 // "a != C ? x : a" to avoid materializing C.
11330 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11331 TVal = LHS;
11332 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11333 FVal = LHS;
11334 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11335 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11336 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11337 // avoid materializing C.
11338 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11339 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11340 Opcode = AArch64ISD::CSINV;
11341 TVal = LHS;
11342 FVal = DAG.getConstant(0, dl, FVal.getValueType());
11343 }
11344 }
11345
11346 SDValue CCVal;
11347 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
11348 EVT VT = TVal.getValueType();
11349 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
11350 }
11351
11352 // Now we know we're dealing with FP values.
11353 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11354 LHS.getValueType() == MVT::f64);
11355 assert(LHS.getValueType() == RHS.getValueType());
11356 EVT VT = TVal.getValueType();
11357 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11358
11359 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11360 // clean. Some of them require two CSELs to implement.
11361 AArch64CC::CondCode CC1, CC2;
11362 changeFPCCToAArch64CC(CC, CC1, CC2);
11363
11364 if (DAG.getTarget().Options.UnsafeFPMath) {
11365 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11366 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11367 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11368 if (RHSVal && RHSVal->isZero()) {
11369 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11370 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11371
11372 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11373 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11374 TVal = LHS;
11375 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11376 CFVal && CFVal->isZero() &&
11377 FVal.getValueType() == LHS.getValueType())
11378 FVal = LHS;
11379 }
11380 }
11381
11382 // Emit first, and possibly only, CSEL.
11383 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11384 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11385
11386 // If we need a second CSEL, emit it, using the output of the first as the
11387 // RHS. We're effectively OR'ing the two CC's together.
11388 if (CC2 != AArch64CC::AL) {
11389 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11390 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11391 }
11392
11393 // Otherwise, return the output of the first CSEL.
11394 return CS1;
11395}
11396
11397SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11398 SelectionDAG &DAG) const {
11399 EVT Ty = Op.getValueType();
11400 auto Idx = Op.getConstantOperandAPInt(2);
11401 int64_t IdxVal = Idx.getSExtValue();
11402 assert(Ty.isScalableVector() &&
11403 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11404
11405 // We can use the splice instruction for certain index values where we are
11406 // able to efficiently generate the correct predicate. The index will be
11407 // inverted and used directly as the input to the ptrue instruction, i.e.
11408 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11409 // splice predicate. However, we can only do this if we can guarantee that
11410 // there are enough elements in the vector, hence we check the index <= min
11411 // number of elements.
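// e.g. a splice of two nxv4i32 vectors with IdxVal == -2 becomes, roughly:
//   ptrue  p0.s, vl2
//   rev    p0.s, p0.s
//   splice z0.s, p0, z0.s, z1.s
// i.e. the last two elements of the first operand followed by leading elements
// of the second.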
11412 std::optional<unsigned> PredPattern;
11413 if (Ty.isScalableVector() && IdxVal < 0 &&
11414 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11415 std::nullopt) {
11416 SDLoc DL(Op);
11417
11418 // Create a predicate where all but the last -IdxVal elements are false.
11419 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11420 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11421 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11422
11423 // Now splice the two inputs together using the predicate.
11424 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11425 Op.getOperand(1));
11426 }
11427
11428 // We can select to an EXT instruction when indexing the first 256 bytes.
11429 unsigned BlockSize = AArch64::SVEBitsPerBlock;
11430 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11431 return Op;
11432
11433 return SDValue();
11434}
11435
11436SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11437 SelectionDAG &DAG) const {
11438 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11439 SDValue LHS = Op.getOperand(0);
11440 SDValue RHS = Op.getOperand(1);
11441 SDValue TVal = Op.getOperand(2);
11442 SDValue FVal = Op.getOperand(3);
11443 SDLoc DL(Op);
11444 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11445}
11446
11447SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11448 SelectionDAG &DAG) const {
11449 SDValue CCVal = Op->getOperand(0);
11450 SDValue TVal = Op->getOperand(1);
11451 SDValue FVal = Op->getOperand(2);
11452 SDLoc DL(Op);
11453
11454 EVT Ty = Op.getValueType();
11455 if (Ty == MVT::aarch64svcount) {
11456 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
11457 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
11458 SDValue Sel =
11459 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
11460 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
11461 }
11462
11463 if (Ty.isScalableVector()) {
11464 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
11465 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
11466 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11467 }
11468
11469 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
11470 // FIXME: Ideally this would be the same as above using i1 types, however
11471 // for the moment we can't deal with fixed i1 vector types properly, so
11472 // instead extend the predicate to a result type sized integer vector.
11473 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
11474 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
11475 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
11476 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
11477 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11478 }
11479
11480 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11481 // instruction.
11482 if (ISD::isOverflowIntrOpRes(CCVal)) {
11483 // Only lower legal XALUO ops.
11484 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
11485 return SDValue();
11486
11487 AArch64CC::CondCode OFCC;
11488 SDValue Value, Overflow;
11489 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
11490 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
11491
11492 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
11493 CCVal, Overflow);
11494 }
11495
11496 // Lower it the same way as we would lower a SELECT_CC node.
11497 ISD::CondCode CC;
11498 SDValue LHS, RHS;
11499 if (CCVal.getOpcode() == ISD::SETCC) {
11500 LHS = CCVal.getOperand(0);
11501 RHS = CCVal.getOperand(1);
11502 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
11503 } else {
11504 LHS = CCVal;
11505 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
11506 CC = ISD::SETNE;
11507 }
11508
11509 // If we are lowering an f16 or bf16 and we do not have full fp16 support,
11510 // convert to an f32 in order to use FCSELSrrr.
11511 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11512 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11513 DAG.getUNDEF(MVT::f32), TVal);
11514 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11515 DAG.getUNDEF(MVT::f32), FVal);
11516 }
11517
11518 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11519
11520 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11521 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
11522 }
11523
11524 return Res;
11525}
11526
11527SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11528 SelectionDAG &DAG) const {
11529 // Jump table entries as PC relative offsets. No additional tweaking
11530 // is necessary here. Just get the address of the jump table.
11531 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11532
11533 CodeModel::Model CM = getTargetMachine().getCodeModel();
11534 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
11535 !Subtarget->isTargetMachO())
11536 return getAddrLarge(JT, DAG);
11537 if (CM == CodeModel::Tiny)
11538 return getAddrTiny(JT, DAG);
11539 return getAddr(JT, DAG);
11540}
11541
11542SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11543 SelectionDAG &DAG) const {
11544 // Jump table entries as PC relative offsets. No additional tweaking
11545 // is necessary here. Just get the address of the jump table.
11546 SDLoc DL(Op);
11547 SDValue JT = Op.getOperand(1);
11548 SDValue Entry = Op.getOperand(2);
11549 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
11550
11551 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11552 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
11553
11554 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11555 // sequence later, to guarantee the integrity of the intermediate values.
11557 "aarch64-jump-table-hardening")) {
11558 CodeModel::Model CM = getTargetMachine().getCodeModel();
11559 if (Subtarget->isTargetMachO()) {
11560 if (CM != CodeModel::Small && CM != CodeModel::Large)
11561 report_fatal_error("Unsupported code-model for hardened jump-table");
11562 } else {
11563 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11564 assert(Subtarget->isTargetELF() &&
11565 "jump table hardening only supported on MachO/ELF");
11566 if (CM != CodeModel::Small)
11567 report_fatal_error("Unsupported code-model for hardened jump-table");
11568 }
11569
11570 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
11571 Entry, SDValue());
11572 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
11573 DAG.getTargetJumpTable(JTI, MVT::i32),
11574 X16Copy.getValue(0), X16Copy.getValue(1));
11575 return SDValue(B, 0);
11576 }
11577
11578 SDNode *Dest =
11579 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
11580 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
11581 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
11582 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
11583}
11584
11585SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11586 SDValue Chain = Op.getOperand(0);
11587 SDValue Dest = Op.getOperand(1);
11588
11589 // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
11590 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11591 if (Dest->isMachineOpcode() &&
11592 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11593 return SDValue();
11594
11595 const MachineFunction &MF = DAG.getMachineFunction();
11596 std::optional<uint16_t> BADisc =
11598 if (!BADisc)
11599 return SDValue();
11600
11601 SDLoc DL(Op);
11602
11603 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11604 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11605 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11606
11607 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
11608 {Dest, Key, Disc, AddrDisc, Chain});
11609 return SDValue(BrA, 0);
11610}
11611
11612SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11613 SelectionDAG &DAG) const {
11614 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11615 CodeModel::Model CM = getTargetMachine().getCodeModel();
11616 if (CM == CodeModel::Large) {
11617 // Use the GOT for the large code model on iOS.
11618 if (Subtarget->isTargetMachO()) {
11619 return getGOT(CP, DAG);
11620 }
11621 if (!getTargetMachine().isPositionIndependent())
11622 return getAddrLarge(CP, DAG);
11623 } else if (CM == CodeModel::Tiny) {
11624 return getAddrTiny(CP, DAG);
11625 }
11626 return getAddr(CP, DAG);
11627}
11628
11629SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11630 SelectionDAG &DAG) const {
11631 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
11632 const BlockAddress *BA = BAN->getBlockAddress();
11633
11634 if (std::optional<uint16_t> BADisc =
11636 *BA->getFunction())) {
11637 SDLoc DL(Op);
11638
11639 // This isn't cheap, but BRIND is rare.
11640 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
11641
11642 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11643
11644 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11645 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11646
11647 SDNode *MOV =
11648 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
11649 {TargetBA, Key, AddrDisc, Disc});
11650 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
11651 SDValue(MOV, 1));
11652 }
11653
11654 CodeModel::Model CM = getTargetMachine().getCodeModel();
11655 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11656 if (!getTargetMachine().isPositionIndependent())
11657 return getAddrLarge(BAN, DAG);
11658 } else if (CM == CodeModel::Tiny) {
11659 return getAddrTiny(BAN, DAG);
11660 }
11661 return getAddr(BAN, DAG);
11662}
11663
11664SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11665 SelectionDAG &DAG) const {
11666 AArch64FunctionInfo *FuncInfo =
11667 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11668
11669 SDLoc DL(Op);
11670 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
11671 getPointerTy(DAG.getDataLayout()));
11672 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
11673 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11674 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11675 MachinePointerInfo(SV));
11676}
11677
11678SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11679 SelectionDAG &DAG) const {
11680 MachineFunction &MF = DAG.getMachineFunction();
11681 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11682
11683 SDLoc DL(Op);
11684 SDValue FR;
11685 if (Subtarget->isWindowsArm64EC()) {
11686 // With the Arm64EC ABI, we compute the address of the varargs save area
11687 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11688 // but calls from an entry thunk can pass in a different address.
11689 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
11690 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
11691 uint64_t StackOffset;
11692 if (FuncInfo->getVarArgsGPRSize() > 0)
11693 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11694 else
11695 StackOffset = FuncInfo->getVarArgsStackOffset();
11696 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
11697 DAG.getConstant(StackOffset, DL, MVT::i64));
11698 } else {
11699 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
11700 ? FuncInfo->getVarArgsGPRIndex()
11701 : FuncInfo->getVarArgsStackIndex(),
11703 }
11704 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11705 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11706 MachinePointerInfo(SV));
11707}
11708
11709SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11710 SelectionDAG &DAG) const {
11711 // The layout of the va_list struct is specified in the AArch64 Procedure Call
11712 // Standard, section B.3.
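// In C-like terms, the structure initialised below is:
//   struct va_list {
//     void *__stack;   // offset 0: next stacked argument
//     void *__gr_top;  // offset 8 (4 on ILP32): end of the saved GP reg area
//     void *__vr_top;  // offset 16 (8 on ILP32): end of the saved FP/SIMD area
//     int   __gr_offs; // offset 24 (12 on ILP32): negated GP bytes remaining
//     int   __vr_offs; // offset 28 (16 on ILP32): negated FP/SIMD bytes remaining
//   };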
11713 MachineFunction &MF = DAG.getMachineFunction();
11714 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11715 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11716 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11717 auto PtrVT = getPointerTy(DAG.getDataLayout());
11718 SDLoc DL(Op);
11719
11720 SDValue Chain = Op.getOperand(0);
11721 SDValue VAList = Op.getOperand(1);
11722 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11723 SmallVector<SDValue, 4> MemOps;
11724
11725 // void *__stack at offset 0
11726 unsigned Offset = 0;
11727 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
11728 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
11729 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
11730 MachinePointerInfo(SV), Align(PtrSize)));
11731
11732 // void *__gr_top at offset 8 (4 on ILP32)
11733 Offset += PtrSize;
11734 int GPRSize = FuncInfo->getVarArgsGPRSize();
11735 if (GPRSize > 0) {
11736 SDValue GRTop, GRTopAddr;
11737
11738 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11739 DAG.getConstant(Offset, DL, PtrVT));
11740
11741 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
11742 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
11743 DAG.getSignedConstant(GPRSize, DL, PtrVT));
11744 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
11745
11746 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
11747 MachinePointerInfo(SV, Offset),
11748 Align(PtrSize)));
11749 }
11750
11751 // void *__vr_top at offset 16 (8 on ILP32)
11752 Offset += PtrSize;
11753 int FPRSize = FuncInfo->getVarArgsFPRSize();
11754 if (FPRSize > 0) {
11755 SDValue VRTop, VRTopAddr;
11756 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11757 DAG.getConstant(Offset, DL, PtrVT));
11758
11759 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
11760 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
11761 DAG.getSignedConstant(FPRSize, DL, PtrVT));
11762 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
11763
11764 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
11765 MachinePointerInfo(SV, Offset),
11766 Align(PtrSize)));
11767 }
11768
11769 // int __gr_offs at offset 24 (12 on ILP32)
11770 Offset += PtrSize;
11771 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11772 DAG.getConstant(Offset, DL, PtrVT));
11773 MemOps.push_back(
11774 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
11775 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11776
11777 // int __vr_offs at offset 28 (16 on ILP32)
11778 Offset += 4;
11779 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11780 DAG.getConstant(Offset, DL, PtrVT));
11781 MemOps.push_back(
11782 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
11783 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11784
11785 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
11786}
11787
11788SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11789 SelectionDAG &DAG) const {
11790 MachineFunction &MF = DAG.getMachineFunction();
11791 Function &F = MF.getFunction();
11792
11793 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
11794 return LowerWin64_VASTART(Op, DAG);
11795 else if (Subtarget->isTargetDarwin())
11796 return LowerDarwin_VASTART(Op, DAG);
11797 else
11798 return LowerAAPCS_VASTART(Op, DAG);
11799}
11800
11801SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11802 SelectionDAG &DAG) const {
11803 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
11804 // pointer.
11805 SDLoc DL(Op);
11806 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11807 unsigned VaListSize =
11808 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11809 ? PtrSize
11810 : Subtarget->isTargetILP32() ? 20 : 32;
11811 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
11812 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
11813
11814 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
11815 DAG.getConstant(VaListSize, DL, MVT::i32),
11816 Align(PtrSize), false, false, /*CI=*/nullptr,
11817 std::nullopt, MachinePointerInfo(DestSV),
11818 MachinePointerInfo(SrcSV));
11819}
11820
11821SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11822 assert(Subtarget->isTargetDarwin() &&
11823 "automatic va_arg instruction only works on Darwin");
11824
11825 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11826 EVT VT = Op.getValueType();
11827 SDLoc DL(Op);
11828 SDValue Chain = Op.getOperand(0);
11829 SDValue Addr = Op.getOperand(1);
11830 MaybeAlign Align(Op.getConstantOperandVal(3));
11831 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11832 auto PtrVT = getPointerTy(DAG.getDataLayout());
11833 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11834 SDValue VAList =
11835 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
11836 Chain = VAList.getValue(1);
11837 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
11838
11839 if (VT.isScalableVector())
11840 report_fatal_error("Passing SVE types to variadic functions is "
11841 "currently not supported");
11842
11843 if (Align && *Align > MinSlotSize) {
11844 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11845 DAG.getConstant(Align->value() - 1, DL, PtrVT));
11846 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
11847 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
11848 }
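// e.g. for a 16-byte aligned argument the adjustment above computes
// VAList = (VAList + 15) & ~15.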
11849
11850 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
11851 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
11852
11853 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11854 // up to 64 bits. At the very least, we have to increase the striding of the
11855 // vaargs list to match this, and for FP values we need to introduce
11856 // FP_ROUND nodes as well.
11857 if (VT.isInteger() && !VT.isVector())
11858 ArgSize = std::max(ArgSize, MinSlotSize);
11859 bool NeedFPTrunc = false;
11860 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11861 ArgSize = 8;
11862 NeedFPTrunc = true;
11863 }
11864
11865 // Increment the pointer, VAList, to the next vaarg
11866 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11867 DAG.getConstant(ArgSize, DL, PtrVT));
11868 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
11869
11870 // Store the incremented VAList to the legalized pointer
11871 SDValue APStore =
11872 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
11873
11874 // Load the actual argument out of the pointer VAList
11875 if (NeedFPTrunc) {
11876 // Load the value as an f64.
11877 SDValue WideFP =
11878 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
11879 // Round the value down to an f32.
11880 SDValue NarrowFP =
11881 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
11882 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
11883 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
11884 // Merge the rounded value with the chain output of the load.
11885 return DAG.getMergeValues(Ops, DL);
11886 }
11887
11888 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
11889}
11890
11891SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11892 SelectionDAG &DAG) const {
11893 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11894 MFI.setFrameAddressIsTaken(true);
11895
11896 EVT VT = Op.getValueType();
11897 SDLoc DL(Op);
11898 unsigned Depth = Op.getConstantOperandVal(0);
11899 SDValue FrameAddr =
11900 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
11901 while (Depth--)
11902 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
11903 MachinePointerInfo());
11904
11905 if (Subtarget->isTargetILP32())
11906 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
11907 DAG.getValueType(VT));
11908
11909 return FrameAddr;
11910}
11911
11912SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11913 SelectionDAG &DAG) const {
11914 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11915
11916 EVT VT = getPointerTy(DAG.getDataLayout());
11917 SDLoc DL(Op);
11918 int FI = MFI.CreateFixedObject(4, 0, false);
11919 return DAG.getFrameIndex(FI, VT);
11920}
11921
11922#define GET_REGISTER_MATCHER
11923#include "AArch64GenAsmMatcher.inc"
11924
11925// FIXME? Maybe this could be a TableGen attribute on some registers and
11926// this table could be generated automatically from RegInfo.
11927Register AArch64TargetLowering::
11928getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11929 Register Reg = MatchRegisterName(RegName);
11930 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
11931 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
11932 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
11933 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
11934 !MRI->isReservedReg(MF, Reg))
11935 Reg = 0;
11936 }
11937 if (Reg)
11938 return Reg;
11939 report_fatal_error(Twine("Invalid register name \""
11940 + StringRef(RegName) + "\"."));
11941}
11942
11943SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
11944 SelectionDAG &DAG) const {
11945 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
11946
11947 EVT VT = Op.getValueType();
11948 SDLoc DL(Op);
11949
11950 SDValue FrameAddr =
11951 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
11952 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11953
11954 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
11955}
11956
11957SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
11958 SelectionDAG &DAG) const {
11959 MachineFunction &MF = DAG.getMachineFunction();
11960 MachineFrameInfo &MFI = MF.getFrameInfo();
11961 MFI.setReturnAddressIsTaken(true);
11962
11963 EVT VT = Op.getValueType();
11964 SDLoc DL(Op);
11965 unsigned Depth = Op.getConstantOperandVal(0);
11966 SDValue ReturnAddress;
11967 if (Depth) {
11968 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11969 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11970 ReturnAddress = DAG.getLoad(
11971 VT, DL, DAG.getEntryNode(),
11972 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11973 } else {
11974 // Return LR, which contains the return address. Mark it an implicit
11975 // live-in.
11976 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11977 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11978 }
11979
11980 // The XPACLRI instruction assembles to a hint-space instruction before
11981 // Armv8.3-A, so it can be used safely on any pre-Armv8.3-A
11982 // architecture. On Armv8.3-A and onwards XPACI is available, so use
11983 // that instead.
11984 SDNode *St;
11985 if (Subtarget->hasPAuth()) {
11986 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11987 } else {
11988 // XPACLRI operates on LR therefore we must move the operand accordingly.
11989 SDValue Chain =
11990 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11991 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11992 }
11993 return SDValue(St, 0);
11994}
11995
11996 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
11997 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
11998SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11999 SelectionDAG &DAG) const {
12000 SDValue Lo, Hi;
12001 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12002 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12003}
12004
12005 bool AArch64TargetLowering::isOffsetFoldingLegal(
12006 const GlobalAddressSDNode *GA) const {
12007 // Offsets are folded in the DAG combine rather than here so that we can
12008 // intelligently choose an offset based on the uses.
12009 return false;
12010}
12011
12012 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12013 bool OptForSize) const {
12014 bool IsLegal = false;
12015 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12016 // and for the 16-bit case when the target has full fp16 support.
12017 // We encode bf16 bit patterns as if they were fp16. This results in very
12018 // strange looking assembly but should populate the register with appropriate
12019 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12020 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12021 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12022 // FIXME: We should be able to handle f128 as well with a clever lowering.
12023 const APInt ImmInt = Imm.bitcastToAPInt();
12024 if (VT == MVT::f64)
12025 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12026 else if (VT == MVT::f32)
12027 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12028 else if (VT == MVT::f16 || VT == MVT::bf16)
12029 IsLegal =
12030 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12031 Imm.isPosZero();
12032
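// For reference, the fmov immediate form can only encode values of the shape
// (-1)^s * (n/16) * 2^e with n in [16, 31] and e in [-3, 4]; e.g. 1.0, 1.5,
// 0.25 and -2.0 are encodable, while values such as 0.1 or 3.3 are not and
// (for f32/f64) fall through to the mov+fmov check below.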
12033 // If we cannot materialize the value in the immediate field for fmov, check if the
12034 // value can be encoded as the immediate operand of a logical instruction.
12035 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12036 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12037 // generate that fmov.
12038 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12039 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12040 // however the mov+fmov sequence is always better because of the reduced
12041 // cache pressure. The timings are still the same if you consider
12042 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12043 // movw+movk is fused). So we limit the number of instructions to at most 2.
12044 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12045 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12046 assert(Insn.size() <= 4 &&
12047 "Should be able to build any value with at most 4 moves");
12048 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12049 IsLegal = Insn.size() <= Limit;
12050 }
12051
12052 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12053 << " imm value: "; Imm.dump(););
12054 return IsLegal;
12055}
12056
12057//===----------------------------------------------------------------------===//
12058// AArch64 Optimization Hooks
12059//===----------------------------------------------------------------------===//
12060
12061static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12062 SDValue Operand, SelectionDAG &DAG,
12063 int &ExtraSteps) {
12064 EVT VT = Operand.getValueType();
12065 if ((ST->hasNEON() &&
12066 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12067 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12068 VT == MVT::v4f32)) ||
12069 (ST->hasSVE() &&
12070 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12071 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12072 // For the reciprocal estimates, convergence is quadratic, so the number
12073 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12074 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12075 // the result for float (23 mantissa bits) is 2 and for double (52
12076 // mantissa bits) is 3.
12077 constexpr unsigned AccurateBits = 8;
12078 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12079 ExtraSteps = DesiredBits <= AccurateBits
12080 ? 0
12081 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
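// e.g. for f32 the precision is 24 bits: ceil(log2(24)) - ceil(log2(8)) = 5 - 3
// = 2 extra steps; for f64 (53 bits) it is 6 - 3 = 3, matching the comment
// above.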
12082 }
12083
12084 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12085 }
12086
12087 return SDValue();
12088}
12089
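// Informal note on the next two hooks: the comparison against +0.0 lets the
// square-root estimate expansion (see getSqrtEstimate below) select a safe
// result for zero inputs, where the Newton sequence would otherwise end in
// 0 * Inf = NaN; getSqrtResultForDenormInput simply returns the input
// unchanged for that case.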
12090SDValue
12091AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12092 const DenormalMode &Mode) const {
12093 SDLoc DL(Op);
12094 EVT VT = Op.getValueType();
12095 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12096 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12097 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12098}
12099
12100SDValue
12101AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12102 SelectionDAG &DAG) const {
12103 return Op;
12104}
12105
12106SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12107 SelectionDAG &DAG, int Enabled,
12108 int &ExtraSteps,
12109 bool &UseOneConst,
12110 bool Reciprocal) const {
12111 if (Enabled == ReciprocalEstimate::Enabled ||
12112 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12113 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12114 DAG, ExtraSteps)) {
12115 SDLoc DL(Operand);
12116 EVT VT = Operand.getValueType();
12117
12118 SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12119
12120 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12121 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
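// Derivation sketch: Newton's method applied to f(y) = 1/y^2 - X gives
// y_{n+1} = y_n * (3 - X * y_n^2) / 2. FRSQRTS(a, b) computes (3 - a*b) / 2,
// so Step = FRSQRTS(X, E*E) and the refined estimate is E * Step, which is
// exactly what the loop below builds.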
12122 for (int i = ExtraSteps; i > 0; --i) {
12123 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12124 Flags);
12125 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12126 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12127 }
12128 if (!Reciprocal)
12129 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12130
12131 ExtraSteps = 0;
12132 return Estimate;
12133 }
12134
12135 return SDValue();
12136}
12137
12138SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12139 SelectionDAG &DAG, int Enabled,
12140 int &ExtraSteps) const {
12141 if (Enabled == ReciprocalEstimate::Enabled)
12142 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12143 DAG, ExtraSteps)) {
12144 SDLoc DL(Operand);
12145 EVT VT = Operand.getValueType();
12146
12147 SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12148
12149 // Newton reciprocal iteration: E * (2 - X * E)
12150 // AArch64 reciprocal iteration instruction: (2 - M * N)
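// Derivation sketch: Newton's method on f(y) = 1/y - X gives
// y_{n+1} = y_n * (2 - X * y_n). FRECPS(a, b) computes 2 - a*b, so each
// iteration below is Estimate * FRECPS(Operand, Estimate).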
12151 for (int i = ExtraSteps; i > 0; --i) {
12152 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12153 Estimate, Flags);
12154 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12155 }
12156
12157 ExtraSteps = 0;
12158 return Estimate;
12159 }
12160
12161 return SDValue();
12162}
12163
12164//===----------------------------------------------------------------------===//
12165// AArch64 Inline Assembly Support
12166//===----------------------------------------------------------------------===//
12167
12168// Table of Constraints
12169 // TODO: This is the current set of constraints supported by ARM for the
12170 // compiler; not all of them may make sense.
12171//
12172// r - A general register
12173// w - An FP/SIMD register of some size in the range v0-v31
12174// x - An FP/SIMD register of some size in the range v0-v15
12175// I - Constant that can be used with an ADD instruction
12176// J - Constant that can be used with a SUB instruction
12177// K - Constant that can be used with a 32-bit logical instruction
12178// L - Constant that can be used with a 64-bit logical instruction
12179// M - Constant that can be used as a 32-bit MOV immediate
12180// N - Constant that can be used as a 64-bit MOV immediate
12181// Q - A memory reference with base register and no offset
12182// S - A symbolic address
12183// Y - Floating point constant zero
12184// Z - Integer constant zero
12185//
12186// Note that general register operands will be output using their 64-bit x
12187// register name, whatever the size of the variable, unless the asm operand
12188// is prefixed by the %w modifier. Floating-point and SIMD register operands
12189// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12190// %q modifier.
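// For illustration (made-up snippets, not taken from any test), typical uses
// of these constraints from C code look like:
//   asm("add %x0, %x1, %x2" : "=r"(res) : "r"(a), "r"(b));  // 'r' operands
//   asm("fadd %s0, %s1, %s2" : "=w"(y) : "w"(x), "w"(x));   // 'w' + %s modifier
// with arguments such as "I"(4095) or "J"(-1) for the immediate constraints.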
12191const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12192 // At this point, we have to lower this constraint to something else, so we
12193 // lower it to an "r" or "w". However, by doing this we will force the result
12194 // to be in register, while the X constraint is much more permissive.
12195 //
12196 // Although we are correct (we are free to emit anything, without
12197 // constraints), we might break use cases that would expect us to be more
12198 // efficient and emit something else.
12199 if (!Subtarget->hasFPARMv8())
12200 return "r";
12201
12202 if (ConstraintVT.isFloatingPoint())
12203 return "w";
12204
12205 if (ConstraintVT.isVector() &&
12206 (ConstraintVT.getSizeInBits() == 64 ||
12207 ConstraintVT.getSizeInBits() == 128))
12208 return "w";
12209
12210 return "r";
12211}
12212
12213 enum class PredicateConstraint { Uph, Upl, Upa };
12214
12215// Returns a {Reg, RegisterClass} tuple if the constraint is
12216// a specific predicate register.
12217//
12218 // For a constraint like "{pn3}", the default path in
12219// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12220// suitable register class for this register is "PPRorPNR", after which it
12221// determines that nxv16i1 is an appropriate type for the constraint, which is
12222// not what we want. The code here pre-empts this by matching the register
12223// explicitly.
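// For example, "{p3}" resolves to {AArch64::P3, &AArch64::PPRRegClass} and
// "{pn3}" to {AArch64::PN3, &AArch64::PNRRegClass}; constraints that do not
// name a predicate register fall through and return std::nullopt.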
12224static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12225 parsePredicateRegAsConstraint(StringRef Constraint) {
12226 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12227 Constraint[1] != 'p')
12228 return std::nullopt;
12229
12230 Constraint = Constraint.substr(2, Constraint.size() - 3);
12231 bool IsPredicateAsCount = Constraint.starts_with("n");
12232 if (IsPredicateAsCount)
12233 Constraint = Constraint.drop_front(1);
12234
12235 unsigned V;
12236 if (Constraint.getAsInteger(10, V) || V > 31)
12237 return std::nullopt;
12238
12239 if (IsPredicateAsCount)
12240 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12241 else
12242 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12243}
12244
12245static std::optional<PredicateConstraint>
12248 .Case("Uph", PredicateConstraint::Uph)
12249 .Case("Upl", PredicateConstraint::Upl)
12250 .Case("Upa", PredicateConstraint::Upa)
12251 .Default(std::nullopt);
12252}
12253
12254static const TargetRegisterClass *
12255 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12256 if (VT != MVT::aarch64svcount &&
12257 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12258 return nullptr;
12259
12260 switch (Constraint) {
12261 case PredicateConstraint::Uph:
12262 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12263 : &AArch64::PPR_p8to15RegClass;
12264 case PredicateConstraint::Upl:
12265 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12266 : &AArch64::PPR_3bRegClass;
12267 case PredicateConstraint::Upa:
12268 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12269 : &AArch64::PPRRegClass;
12270 }
12271
12272 llvm_unreachable("Missing PredicateConstraint!");
12273}
12274
12275 enum class ReducedGprConstraint { Uci, Ucj };
12276
12277static std::optional<ReducedGprConstraint>
12280 .Case("Uci", ReducedGprConstraint::Uci)
12281 .Case("Ucj", ReducedGprConstraint::Ucj)
12282 .Default(std::nullopt);
12283}
12284
12285static const TargetRegisterClass *
12286 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12287 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12288 return nullptr;
12289
12290 switch (Constraint) {
12291 case ReducedGprConstraint::Uci:
12292 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12293 case ReducedGprConstraint::Ucj:
12294 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12295 }
12296
12297 llvm_unreachable("Missing ReducedGprConstraint!");
12298}
12299
12300// The set of cc code supported is from
12301// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
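// For illustration (made-up snippet), a flag output operand is written as
//   asm("cmp %1, %2" : "=@cceq"(matched) : "r"(a), "r"(b));
// and reaches this function as the constraint string "{@cceq}", which maps to
// AArch64CC::EQ below.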
12304 .Case("{@cchi}", AArch64CC::HI)
12305 .Case("{@cccs}", AArch64CC::HS)
12306 .Case("{@cclo}", AArch64CC::LO)
12307 .Case("{@ccls}", AArch64CC::LS)
12308 .Case("{@cccc}", AArch64CC::LO)
12309 .Case("{@cceq}", AArch64CC::EQ)
12310 .Case("{@ccgt}", AArch64CC::GT)
12311 .Case("{@ccge}", AArch64CC::GE)
12312 .Case("{@cclt}", AArch64CC::LT)
12313 .Case("{@ccle}", AArch64CC::LE)
12314 .Case("{@cchs}", AArch64CC::HS)
12315 .Case("{@ccne}", AArch64CC::NE)
12316 .Case("{@ccvc}", AArch64CC::VC)
12317 .Case("{@ccpl}", AArch64CC::PL)
12318 .Case("{@ccvs}", AArch64CC::VS)
12319 .Case("{@ccmi}", AArch64CC::MI)
12320 .Default(AArch64CC::Invalid);
12321 return Cond;
12322}
12323
12324/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12325/// WZR, invert(<cond>)'.
12326 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12327 SelectionDAG &DAG) {
12328 return DAG.getNode(
12329 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
12330 DAG.getConstant(0, DL, MVT::i32),
12331 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
12332}
12333
12334// Lower @cc flag output via getSETCC.
12335SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12336 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12337 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12338 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12339 if (Cond == AArch64CC::Invalid)
12340 return SDValue();
12341 // The output variable should be a scalar integer.
12342 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12343 OpInfo.ConstraintVT.getSizeInBits() < 8)
12344 report_fatal_error("Flag output operand is of invalid type");
12345
12346 // Get NZCV register. Only update chain when copyfrom is glued.
12347 if (Glue.getNode()) {
12348 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
12349 Chain = Glue.getValue(1);
12350 } else
12351 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
12352 // Extract CC code.
12353 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12354
12355 SDValue Result;
12356
12357 // Truncate or ZERO_EXTEND based on value types.
12358 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12359 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12360 else
12361 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12362
12363 return Result;
12364}
12365
12366/// getConstraintType - Given a constraint letter, return the type of
12367/// constraint it is for this target.
12368 AArch64TargetLowering::ConstraintType
12369 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12370 if (Constraint.size() == 1) {
12371 switch (Constraint[0]) {
12372 default:
12373 break;
12374 case 'x':
12375 case 'w':
12376 case 'y':
12377 return C_RegisterClass;
12378 // An address with a single base register. Due to the way we
12379 // currently handle addresses it is the same as 'r'.
12380 case 'Q':
12381 return C_Memory;
12382 case 'I':
12383 case 'J':
12384 case 'K':
12385 case 'L':
12386 case 'M':
12387 case 'N':
12388 case 'Y':
12389 case 'Z':
12390 return C_Immediate;
12391 case 'z':
12392 case 'S': // A symbol or label reference with a constant offset
12393 return C_Other;
12394 }
12395 } else if (parsePredicateConstraint(Constraint))
12396 return C_RegisterClass;
12397 else if (parseReducedGprConstraint(Constraint))
12398 return C_RegisterClass;
12399 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12400 return C_Other;
12401 return TargetLowering::getConstraintType(Constraint);
12402}
12403
12404/// Examine constraint type and operand type and determine a weight value.
12405/// This object must already have been set up with the operand type
12406/// and the current alternative constraint selected.
12407 TargetLowering::ConstraintWeight
12408 AArch64TargetLowering::getSingleConstraintMatchWeight(
12409 AsmOperandInfo &info, const char *constraint) const {
12410 ConstraintWeight weight = CW_Invalid;
12411 Value *CallOperandVal = info.CallOperandVal;
12412 // If we don't have a value, we can't do a match,
12413 // but allow it at the lowest weight.
12414 if (!CallOperandVal)
12415 return CW_Default;
12416 Type *type = CallOperandVal->getType();
12417 // Look at the constraint type.
12418 switch (*constraint) {
12419 default:
12420 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12421 break;
12422 case 'x':
12423 case 'w':
12424 case 'y':
12425 if (type->isFloatingPointTy() || type->isVectorTy())
12426 weight = CW_Register;
12427 break;
12428 case 'z':
12429 weight = CW_Constant;
12430 break;
12431 case 'U':
12432 if (parsePredicateConstraint(constraint) ||
12433 parseReducedGprConstraint(constraint))
12434 weight = CW_Register;
12435 break;
12436 }
12437 return weight;
12438}
12439
12440std::pair<unsigned, const TargetRegisterClass *>
12441AArch64TargetLowering::getRegForInlineAsmConstraint(
12442 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12443 if (Constraint.size() == 1) {
12444 switch (Constraint[0]) {
12445 case 'r':
12446 if (VT.isScalableVector())
12447 return std::make_pair(0U, nullptr);
12448 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12449 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
12450 if (VT.getFixedSizeInBits() == 64)
12451 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
12452 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
12453 case 'w': {
12454 if (!Subtarget->hasFPARMv8())
12455 break;
12456 if (VT.isScalableVector()) {
12457 if (VT.getVectorElementType() != MVT::i1)
12458 return std::make_pair(0U, &AArch64::ZPRRegClass);
12459 return std::make_pair(0U, nullptr);
12460 }
12461 if (VT == MVT::Other)
12462 break;
12463 uint64_t VTSize = VT.getFixedSizeInBits();
12464 if (VTSize == 16)
12465 return std::make_pair(0U, &AArch64::FPR16RegClass);
12466 if (VTSize == 32)
12467 return std::make_pair(0U, &AArch64::FPR32RegClass);
12468 if (VTSize == 64)
12469 return std::make_pair(0U, &AArch64::FPR64RegClass);
12470 if (VTSize == 128)
12471 return std::make_pair(0U, &AArch64::FPR128RegClass);
12472 break;
12473 }
12474 // The instructions that this constraint is designed for can
12475 // only take 128-bit registers so just use that regclass.
12476 case 'x':
12477 if (!Subtarget->hasFPARMv8())
12478 break;
12479 if (VT.isScalableVector())
12480 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
12481 if (VT.getSizeInBits() == 128)
12482 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
12483 break;
12484 case 'y':
12485 if (!Subtarget->hasFPARMv8())
12486 break;
12487 if (VT.isScalableVector())
12488 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
12489 break;
12490 }
12491 } else {
12492 if (const auto P = parsePredicateRegAsConstraint(Constraint))
12493 return *P;
12494 if (const auto PC = parsePredicateConstraint(Constraint))
12495 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
12496 return std::make_pair(0U, RegClass);
12497
12498 if (const auto RGC = parseReducedGprConstraint(Constraint))
12499 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
12500 return std::make_pair(0U, RegClass);
12501 }
12502 if (StringRef("{cc}").equals_insensitive(Constraint) ||
12503 parseConstraintCode(Constraint) != AArch64CC::Invalid)
12504 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
12505
12506 if (Constraint == "{za}") {
12507 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
12508 }
12509
12510 if (Constraint == "{zt0}") {
12511 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
12512 }
12513
12514 // Use the default implementation in TargetLowering to convert the register
12515 // constraint into a member of a register class.
12516 std::pair<unsigned, const TargetRegisterClass *> Res;
12517 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12518
12519 // Not found as a standard register?
12520 if (!Res.second) {
12521 unsigned Size = Constraint.size();
12522 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12523 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12524 int RegNo;
12525 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
12526 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12527 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12528 // By default we'll emit v0-v31 for this unless there's a modifier, in which
12529 // case the correctly sized register will be emitted instead.
12530 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12531 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
12532 Res.second = &AArch64::FPR64RegClass;
12533 } else {
12534 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
12535 Res.second = &AArch64::FPR128RegClass;
12536 }
12537 }
12538 }
12539 }
12540
12541 if (Res.second && !Subtarget->hasFPARMv8() &&
12542 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
12543 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
12544 return std::make_pair(0U, nullptr);
12545
12546 return Res;
12547}
12548
12549 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
12550 llvm::Type *Ty,
12551 bool AllowUnknown) const {
12552 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
12553 return EVT(MVT::i64x8);
12554
12555 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12556}
12557
12558/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12559/// vector. If it is invalid, don't add anything to Ops.
12560void AArch64TargetLowering::LowerAsmOperandForConstraint(
12561 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12562 SelectionDAG &DAG) const {
12563 SDValue Result;
12564
12565 // Currently only support length 1 constraints.
12566 if (Constraint.size() != 1)
12567 return;
12568
12569 char ConstraintLetter = Constraint[0];
12570 switch (ConstraintLetter) {
12571 default:
12572 break;
12573
12574 // This set of constraints deal with valid constants for various instructions.
12575 // Validate and return a target constant for them if we can.
12576 case 'z': {
12577 // 'z' maps to xzr or wzr so it needs an input of 0.
12578 if (!isNullConstant(Op))
12579 return;
12580
12581 if (Op.getValueType() == MVT::i64)
12582 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
12583 else
12584 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
12585 break;
12586 }
12587 case 'S':
12588 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12589 // supported for PIC while "s" isn't, making "s" less useful. We implement
12590 // "S" but not "s".
12591 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
12592 break;
12593
12594 case 'I':
12595 case 'J':
12596 case 'K':
12597 case 'L':
12598 case 'M':
12599 case 'N':
12600 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12601 if (!C)
12602 return;
12603
12604 // Grab the value and do some validation.
12605 uint64_t CVal = C->getZExtValue();
12606 switch (ConstraintLetter) {
12607 // The I constraint applies only to simple ADD or SUB immediate operands:
12608 // i.e. 0 to 4095 with optional shift by 12
12609 // The J constraint applies only to ADD or SUB immediates that would be
12610 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12611 // instruction [or vice versa], in other words -1 to -4095 with optional
12612 // left shift by 12.
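// For example, 4095 and (4095 << 12) both satisfy 'I', while -1 and
// -(4095 << 12) satisfy 'J', since their negations fit the ADD/SUB immediate
// encoding checked below.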
12613 case 'I':
12614 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
12615 break;
12616 return;
12617 case 'J': {
12618 uint64_t NVal = -C->getSExtValue();
12619 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
12620 CVal = C->getSExtValue();
12621 break;
12622 }
12623 return;
12624 }
12625 // The K and L constraints apply *only* to logical immediates, including
12626 // what used to be the MOVI alias for ORR (though the MOVI alias has now
12627 // been removed and MOV should be used). So these constraints have to
12628 // distinguish between bit patterns that are valid 32-bit or 64-bit
12629 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12630 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12631 // versa.
12632 case 'K':
12633 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12634 break;
12635 return;
12636 case 'L':
12637 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12638 break;
12639 return;
12640 // The M and N constraints are a superset of K and L respectively, for use
12641 // with the MOV (immediate) alias. As well as the logical immediates they
12642 // also match 32 or 64-bit immediates that can be loaded either using a
12643 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12644 // (M) or 64-bit 0x1234000000000000 (N) etc.
12645 // As a note, some of this code is liberally stolen from the asm parser.
12646 case 'M': {
12647 if (!isUInt<32>(CVal))
12648 return;
12649 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12650 break;
12651 if ((CVal & 0xFFFF) == CVal)
12652 break;
12653 if ((CVal & 0xFFFF0000ULL) == CVal)
12654 break;
12655 uint64_t NCVal = ~(uint32_t)CVal;
12656 if ((NCVal & 0xFFFFULL) == NCVal)
12657 break;
12658 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12659 break;
12660 return;
12661 }
12662 case 'N': {
12663 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12664 break;
12665 if ((CVal & 0xFFFFULL) == CVal)
12666 break;
12667 if ((CVal & 0xFFFF0000ULL) == CVal)
12668 break;
12669 if ((CVal & 0xFFFF00000000ULL) == CVal)
12670 break;
12671 if ((CVal & 0xFFFF000000000000ULL) == CVal)
12672 break;
12673 uint64_t NCVal = ~CVal;
12674 if ((NCVal & 0xFFFFULL) == NCVal)
12675 break;
12676 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12677 break;
12678 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12679 break;
12680 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12681 break;
12682 return;
12683 }
12684 default:
12685 return;
12686 }
12687
12688 // All assembler immediates are 64-bit integers.
12689 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
12690 break;
12691 }
12692
12693 if (Result.getNode()) {
12694 Ops.push_back(Result);
12695 return;
12696 }
12697
12698 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12699}
12700
12701//===----------------------------------------------------------------------===//
12702// AArch64 Advanced SIMD Support
12703//===----------------------------------------------------------------------===//
12704
12705/// WidenVector - Given a value in the V64 register class, produce the
12706/// equivalent value in the V128 register class.
12707 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
12708 EVT VT = V64Reg.getValueType();
12709 unsigned NarrowSize = VT.getVectorNumElements();
12710 MVT EltTy = VT.getVectorElementType().getSimpleVT();
12711 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
12712 SDLoc DL(V64Reg);
12713
12714 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
12715 V64Reg, DAG.getConstant(0, DL, MVT::i64));
12716}
12717
12718/// getExtFactor - Determine the adjustment factor for the position when
12719/// generating an "extract from vector registers" instruction.
12720static unsigned getExtFactor(SDValue &V) {
12721 EVT EltType = V.getValueType().getVectorElementType();
12722 return EltType.getSizeInBits() / 8;
12723}
12724
12725// Check if a vector is built from one vector via extracted elements of
12726// another together with an AND mask, ensuring that all elements fit
12727// within range. This can be reconstructed using AND and NEON's TBL1.
12729 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12730 SDLoc dl(Op);
12731 EVT VT = Op.getValueType();
12732 assert(!VT.isScalableVector() &&
12733 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12734
12735 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
12736 // directly to TBL1.
12737 if (VT != MVT::v16i8 && VT != MVT::v8i8)
12738 return SDValue();
12739
12740 unsigned NumElts = VT.getVectorNumElements();
12741 assert((NumElts == 8 || NumElts == 16) &&
12742 "Need to have exactly 8 or 16 elements in vector.");
12743
12744 SDValue SourceVec;
12745 SDValue MaskSourceVec;
12746 SmallVector<SDValue, 16> AndMaskConstants;
12747
12748 for (unsigned i = 0; i < NumElts; ++i) {
12749 SDValue V = Op.getOperand(i);
12750 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12751 return SDValue();
12752
12753 SDValue OperandSourceVec = V.getOperand(0);
12754 if (!SourceVec)
12755 SourceVec = OperandSourceVec;
12756 else if (SourceVec != OperandSourceVec)
12757 return SDValue();
12758
12759 // This only looks at shuffles with elements that are
12760 // a) truncated by a constant AND mask extracted from a mask vector, or
12761 // b) extracted directly from a mask vector.
12762 SDValue MaskSource = V.getOperand(1);
12763 if (MaskSource.getOpcode() == ISD::AND) {
12764 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
12765 return SDValue();
12766
12767 AndMaskConstants.push_back(MaskSource.getOperand(1));
12768 MaskSource = MaskSource->getOperand(0);
12769 } else if (!AndMaskConstants.empty()) {
12770 // Either all or no operands should have an AND mask.
12771 return SDValue();
12772 }
12773
12774 // An ANY_EXTEND may be inserted between the AND and the source vector
12775 // extraction. We don't care about that, so we can just skip it.
12776 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
12777 MaskSource = MaskSource.getOperand(0);
12778
12779 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12780 return SDValue();
12781
12782 SDValue MaskIdx = MaskSource.getOperand(1);
12783 if (!isa<ConstantSDNode>(MaskIdx) ||
12784 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
12785 return SDValue();
12786
12787 // We only apply this if all elements come from the same vector with the
12788 // same vector type.
12789 if (!MaskSourceVec) {
12790 MaskSourceVec = MaskSource->getOperand(0);
12791 if (MaskSourceVec.getValueType() != VT)
12792 return SDValue();
12793 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
12794 return SDValue();
12795 }
12796 }
12797
12798 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12799 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12800 // insert, we know that the index in the mask must be smaller than the number
12801 // of elements in the source, or we would have an out-of-bounds access.
12802 if (NumElts == 8)
12803 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
12804 DAG.getUNDEF(VT));
12805
12806 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12807 if (!AndMaskConstants.empty())
12808 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
12809 DAG.getBuildVector(VT, dl, AndMaskConstants));
12810
12811 return DAG.getNode(
12812 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12813 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
12814 MaskSourceVec);
12815}
12816
12817// Gather data to see if the operation can be modelled as a
12818// shuffle in combination with VEXTs.
12819 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
12820 SelectionDAG &DAG) const {
12821 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12822 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12823 SDLoc dl(Op);
12824 EVT VT = Op.getValueType();
12825 assert(!VT.isScalableVector() &&
12826 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12827 unsigned NumElts = VT.getVectorNumElements();
12828
12829 struct ShuffleSourceInfo {
12830 SDValue Vec;
12831 unsigned MinElt;
12832 unsigned MaxElt;
12833
12834 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12835 // be compatible with the shuffle we intend to construct. As a result
12836 // ShuffleVec will be some sliding window into the original Vec.
12837 SDValue ShuffleVec;
12838
12839 // Code should guarantee that element i in Vec starts at element
12840 // "WindowBase + i * WindowScale" in ShuffleVec.
12841 int WindowBase;
12842 int WindowScale;
12843
12844 ShuffleSourceInfo(SDValue Vec)
12845 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12846 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12847
12848 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12849 };
12850
12851 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12852 // node.
12853 SmallVector<ShuffleSourceInfo, 2> Sources;
12854 for (unsigned i = 0; i < NumElts; ++i) {
12855 SDValue V = Op.getOperand(i);
12856 if (V.isUndef())
12857 continue;
12858 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12859 !isa<ConstantSDNode>(V.getOperand(1)) ||
12860 V.getOperand(0).getValueType().isScalableVector()) {
12861 LLVM_DEBUG(
12862 dbgs() << "Reshuffle failed: "
12863 "a shuffle can only come from building a vector from "
12864 "various elements of other fixed-width vectors, provided "
12865 "their indices are constant\n");
12866 return SDValue();
12867 }
12868
12869 // Add this element source to the list if it's not already there.
12870 SDValue SourceVec = V.getOperand(0);
12871 auto Source = find(Sources, SourceVec);
12872 if (Source == Sources.end())
12873 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
12874
12875 // Update the minimum and maximum lane number seen.
12876 unsigned EltNo = V.getConstantOperandVal(1);
12877 Source->MinElt = std::min(Source->MinElt, EltNo);
12878 Source->MaxElt = std::max(Source->MaxElt, EltNo);
12879 }
12880
12881 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12882 // better than moving to/from gpr registers for larger vectors.
12883 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12884 // Construct a mask for the tbl. We may need to adjust the index for types
12885 // larger than i8.
12887 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12888 for (unsigned I = 0; I < NumElts; ++I) {
12889 SDValue V = Op.getOperand(I);
12890 if (V.isUndef()) {
12891 for (unsigned OF = 0; OF < OutputFactor; OF++)
12892 Mask.push_back(-1);
12893 continue;
12894 }
12895 // Set the Mask lanes adjusted for the size of the input and output
12896 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12897 // output element, adjusted in their positions per input and output types.
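// E.g. (illustrative) for a v8i16 result built from v8i16 sources,
// OutputFactor is 2 and lane L of source S contributes bytes 16*S + 2*L and
// 16*S + 2*L + 1 to the TBL mask.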
12898 unsigned Lane = V.getConstantOperandVal(1);
12899 for (unsigned S = 0; S < Sources.size(); S++) {
12900 if (V.getOperand(0) == Sources[S].Vec) {
12901 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12902 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12903 for (unsigned OF = 0; OF < OutputFactor; OF++)
12904 Mask.push_back(InputBase + OF);
12905 break;
12906 }
12907 }
12908 }
12909
12910 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12911 // v16i8, and the TBLMask
12912 SmallVector<SDValue, 16> TBLOperands;
12913 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
12914 ? Intrinsic::aarch64_neon_tbl3
12915 : Intrinsic::aarch64_neon_tbl4,
12916 dl, MVT::i32));
12917 for (unsigned i = 0; i < Sources.size(); i++) {
12918 SDValue Src = Sources[i].Vec;
12919 EVT SrcVT = Src.getValueType();
12920 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
12921 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12922 "Expected a legally typed vector");
12923 if (SrcVT.is64BitVector())
12924 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
12925 DAG.getUNDEF(MVT::v8i8));
12926 TBLOperands.push_back(Src);
12927 }
12928
12930 for (unsigned i = 0; i < Mask.size(); i++)
12931 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
12932 assert((Mask.size() == 8 || Mask.size() == 16) &&
12933 "Expected a v8i8 or v16i8 Mask");
12934 TBLOperands.push_back(
12935 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
12936
12937 SDValue Shuffle =
12938 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
12939 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
12940 return DAG.getBitcast(VT, Shuffle);
12941 }
12942
12943 if (Sources.size() > 2) {
12944 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
12945 << "sensible when at most two source vectors are "
12946 << "involved\n");
12947 return SDValue();
12948 }
12949
12950 // Find out the smallest element size among result and two sources, and use
12951 // it as element size to build the shuffle_vector.
12952 EVT SmallestEltTy = VT.getVectorElementType();
12953 for (auto &Source : Sources) {
12954 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
12955 if (SrcEltTy.bitsLT(SmallestEltTy)) {
12956 SmallestEltTy = SrcEltTy;
12957 }
12958 }
12959 unsigned ResMultiplier =
12960 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12961 uint64_t VTSize = VT.getFixedSizeInBits();
12962 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
12963 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
12964
12965 // If the source vector is too wide or too narrow, we may nevertheless be able
12966 // to construct a compatible shuffle either by concatenating it with UNDEF or
12967 // extracting a suitable range of elements.
12968 for (auto &Src : Sources) {
12969 EVT SrcVT = Src.ShuffleVec.getValueType();
12970
12971 TypeSize SrcVTSize = SrcVT.getSizeInBits();
12972 if (SrcVTSize == TypeSize::getFixed(VTSize))
12973 continue;
12974
12975 // This stage of the search produces a source with the same element type as
12976 // the original, but with a total width matching the BUILD_VECTOR output.
12977 EVT EltVT = SrcVT.getVectorElementType();
12978 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
12979 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
12980
12981 if (SrcVTSize.getFixedValue() < VTSize) {
12982 assert(2 * SrcVTSize == VTSize);
12983 // We can pad out the smaller vector for free, so if it's part of a
12984 // shuffle...
12985 Src.ShuffleVec =
12986 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
12987 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
12988 continue;
12989 }
12990
12991 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
12992 LLVM_DEBUG(
12993 dbgs() << "Reshuffle failed: result vector too small to extract\n");
12994 return SDValue();
12995 }
12996
12997 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
12998 LLVM_DEBUG(
12999 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13000 return SDValue();
13001 }
13002
13003 if (Src.MinElt >= NumSrcElts) {
13004 // The extraction can just take the second half
13005 Src.ShuffleVec =
13006 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
13007 DAG.getConstant(NumSrcElts, dl, MVT::i64));
13008 Src.WindowBase = -NumSrcElts;
13009 } else if (Src.MaxElt < NumSrcElts) {
13010 // The extraction can just take the first half
13011 Src.ShuffleVec =
13012 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
13013 DAG.getConstant(0, dl, MVT::i64));
13014 } else {
13015 // An actual VEXT is needed
13016 SDValue VEXTSrc1 =
13017 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
13018 DAG.getConstant(0, dl, MVT::i64));
13019 SDValue VEXTSrc2 =
13020 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
13021 DAG.getConstant(NumSrcElts, dl, MVT::i64));
13022 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13023
13024 if (!SrcVT.is64BitVector()) {
13025 LLVM_DEBUG(
13026 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13027 "for SVE vectors.");
13028 return SDValue();
13029 }
13030
13031 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
13032 VEXTSrc2,
13033 DAG.getConstant(Imm, dl, MVT::i32));
13034 Src.WindowBase = -Src.MinElt;
13035 }
13036 }
13037
13038 // Another possible incompatibility occurs from the vector element types. We
13039 // can fix this by bitcasting the source vectors to the same type we intend
13040 // for the shuffle.
13041 for (auto &Src : Sources) {
13042 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13043 if (SrcEltTy == SmallestEltTy)
13044 continue;
13045 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13046 if (DAG.getDataLayout().isBigEndian()) {
13047 Src.ShuffleVec =
13048 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
13049 } else {
13050 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
13051 }
13052 Src.WindowScale =
13053 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13054 Src.WindowBase *= Src.WindowScale;
13055 }
13056
13057 // Final check before we try to actually produce a shuffle.
13058 LLVM_DEBUG({
13059 for (auto Src : Sources)
13060 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13061 });
13062
13063 // The stars all align, our next step is to produce the mask for the shuffle.
13064 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13065 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13066 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13067 SDValue Entry = Op.getOperand(i);
13068 if (Entry.isUndef())
13069 continue;
13070
13071 auto Src = find(Sources, Entry.getOperand(0));
13072 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13073
13074 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13075 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13076 // segment.
13077 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13078 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13079 VT.getScalarSizeInBits());
13080 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13081
13082 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13083 // starting at the appropriate offset.
13084 int *LaneMask = &Mask[i * ResMultiplier];
13085
13086 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13087 ExtractBase += NumElts * (Src - Sources.begin());
13088 for (int j = 0; j < LanesDefined; ++j)
13089 LaneMask[j] = ExtractBase + j;
13090 }
13091
13092 // Final check before we try to produce nonsense...
13093 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13094 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13095 return SDValue();
13096 }
13097
13098 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13099 for (unsigned i = 0; i < Sources.size(); ++i)
13100 ShuffleOps[i] = Sources[i].ShuffleVec;
13101
13102 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
13103 ShuffleOps[1], Mask);
13104 SDValue V;
13105 if (DAG.getDataLayout().isBigEndian()) {
13106 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
13107 } else {
13108 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
13109 }
13110
13111 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13112 dbgs() << "Reshuffle, creating node: "; V.dump(););
13113
13114 return V;
13115}
13116
13117// check if an EXT instruction can handle the shuffle mask when the
13118// vector sources of the shuffle are the same.
13119static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13120 unsigned NumElts = VT.getVectorNumElements();
13121
13122 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13123 if (M[0] < 0)
13124 return false;
13125
13126 Imm = M[0];
13127
13128 // If this is a VEXT shuffle, the immediate value is the index of the first
13129 // element. The other shuffle indices must be the successive elements after
13130 // the first one.
13131 unsigned ExpectedElt = Imm;
13132 for (unsigned i = 1; i < NumElts; ++i) {
13133 // Increment the expected index. If it wraps around, just follow it
13134 // back to index zero and keep going.
13135 ++ExpectedElt;
13136 if (ExpectedElt == NumElts)
13137 ExpectedElt = 0;
13138
13139 if (M[i] < 0)
13140 continue; // ignore UNDEF indices
13141 if (ExpectedElt != static_cast<unsigned>(M[i]))
13142 return false;
13143 }
13144
13145 return true;
13146}
13147
13148// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13149// v4i32s. This is really a truncate, which we can construct out of (legal)
13150// concats and truncate nodes.
13152 if (V.getValueType() != MVT::v16i8)
13153 return SDValue();
13154 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13155
13156 for (unsigned X = 0; X < 4; X++) {
13157 // Check the first item in each group is an extract from lane 0 of a v4i32
13158 // or v4i16.
13159 SDValue BaseExt = V.getOperand(X * 4);
13160 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13161 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13162 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13163 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13164 BaseExt.getConstantOperandVal(1) != 0)
13165 return SDValue();
13166 SDValue Base = BaseExt.getOperand(0);
13167 // And check the other items are extracts from the same vector.
13168 for (unsigned Y = 1; Y < 4; Y++) {
13169 SDValue Ext = V.getOperand(X * 4 + Y);
13170 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13171 Ext.getOperand(0) != Base ||
13172 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13173 Ext.getConstantOperandVal(1) != Y)
13174 return SDValue();
13175 }
13176 }
13177
13178 // Turn the buildvector into a series of truncates and concats, which will
13179 // become uzip1s. Any v4i32s we found get truncated to v4i16, and the results
13180 // are concatenated together to produce 2 v8i16s. These are both truncated
13181 // and concatenated together.
13182 SDLoc DL(V);
13183 SDValue Trunc[4] = {
13184 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13185 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13186 for (SDValue &V : Trunc)
13187 if (V.getValueType() == MVT::v4i32)
13188 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13189 SDValue Concat0 =
13190 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13191 SDValue Concat1 =
13192 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13193 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13194 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13195 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13196}
13197
13198 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
13199 /// element width than the vector lane type. If that is the case, the function
13200 /// returns true and writes the value of the DUP instruction lane operand into
13201 /// DupLaneOp.
13202static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13203 unsigned &DupLaneOp) {
13204 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13205 "Only possible block sizes for wide DUP are: 16, 32, 64");
13206
13207 if (BlockSize <= VT.getScalarSizeInBits())
13208 return false;
13209 if (BlockSize % VT.getScalarSizeInBits() != 0)
13210 return false;
13211 if (VT.getSizeInBits() % BlockSize != 0)
13212 return false;
13213
13214 size_t SingleVecNumElements = VT.getVectorNumElements();
13215 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13216 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13217
13218 // We are looking for masks like
13219 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13220 // might be replaced by 'undefined'. BlockIndices will eventually contain
13221 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13222 // for the above examples)
13223 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13224 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13225 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13226 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13227 if (Elt < 0)
13228 continue;
13229 // For now we don't support shuffles that use the second operand
13230 if ((unsigned)Elt >= SingleVecNumElements)
13231 return false;
13232 if (BlockElts[I] < 0)
13233 BlockElts[I] = Elt;
13234 else if (BlockElts[I] != Elt)
13235 return false;
13236 }
13237
13238 // We found a candidate block (possibly with some undefs). It must be a
13239 // sequence of consecutive integers starting with a value divisible by
13240 // NumEltsPerBlock with some values possibly replaced by undef-s.
13241
13242 // Find first non-undef element
13243 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13244 assert(FirstRealEltIter != BlockElts.end() &&
13245 "Shuffle with all-undefs must have been caught by previous cases, "
13246 "e.g. isSplat()");
13247 if (FirstRealEltIter == BlockElts.end()) {
13248 DupLaneOp = 0;
13249 return true;
13250 }
13251
13252 // Index of FirstRealElt in BlockElts
13253 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13254
13255 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13256 return false;
13257 // BlockElts[0] must have the following value if it isn't undef:
13258 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13259
13260 // Check the first element
13261 if (Elt0 % NumEltsPerBlock != 0)
13262 return false;
13263 // Check that the sequence indeed consists of consecutive integers (modulo
13264 // undefs)
13265 for (size_t I = 0; I < NumEltsPerBlock; I++)
13266 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13267 return false;
13268
13269 DupLaneOp = Elt0 / NumEltsPerBlock;
13270 return true;
13271}
13272
13273// check if an EXT instruction can handle the shuffle mask when the
13274// vector sources of the shuffle are different.
13275static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13276 unsigned &Imm) {
13277 // Look for the first non-undef element.
13278 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13279
13280 // Benefit from APInt to handle overflow when calculating expected element.
13281 unsigned NumElts = VT.getVectorNumElements();
13282 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13283 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13284 /*implicitTrunc=*/true);
13285 // The following shuffle indices must be the successive elements after the
13286 // first real element.
13287 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13288 return Elt != ExpectedElt++ && Elt != -1;
13289 });
13290 if (FoundWrongElt)
13291 return false;
13292
13293 // The index of an EXT is the first element if it is not UNDEF.
13294 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13295 // value of the first element. E.g.
13296 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13297 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13298 // ExpectedElt is the last mask index plus 1.
13299 Imm = ExpectedElt.getZExtValue();
13300
13301 // There are two different cases that require reversing the input vectors.
13302 // For example, for vector <4 x i32> we have the following cases,
13303 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13304 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13305 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13306 // to reverse two input vectors.
13307 if (Imm < NumElts)
13308 ReverseEXT = true;
13309 else
13310 Imm -= NumElts;
13311
13312 return true;
13313}
13314
13315/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13316/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13317/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13318static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13319 unsigned NumElts = VT.getVectorNumElements();
13320 if (NumElts % 2 != 0)
13321 return false;
13322 WhichResult = (M[0] == 0 ? 0 : 1);
13323 unsigned Idx = WhichResult * NumElts / 2;
13324 for (unsigned i = 0; i != NumElts; i += 2) {
13325 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13326 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13327 return false;
13328 Idx += 1;
13329 }
13330
13331 return true;
13332}
13333
13334/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13335/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13337 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13337static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13338 unsigned Half = VT.getVectorNumElements() / 2;
13339 WhichResult = (M[0] == 0 ? 0 : 1);
13340 for (unsigned j = 0; j != 2; ++j) {
13341 unsigned Idx = WhichResult;
13342 for (unsigned i = 0; i != Half; ++i) {
13343 int MIdx = M[i + j * Half];
13344 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13345 return false;
13346 Idx += 2;
13347 }
13348 }
13349
13350 return true;
13351}
13352
13353/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13354/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13355/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13356static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13357 unsigned NumElts = VT.getVectorNumElements();
13358 if (NumElts % 2 != 0)
13359 return false;
13360 WhichResult = (M[0] == 0 ? 0 : 1);
13361 for (unsigned i = 0; i < NumElts; i += 2) {
13362 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13363 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13364 return false;
13365 }
13366 return true;
13367}
13368
13369static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13370 bool &DstIsLeft, int &Anomaly) {
13371 if (M.size() != static_cast<size_t>(NumInputElements))
13372 return false;
13373
13374 int NumLHSMatch = 0, NumRHSMatch = 0;
13375 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13376
13377 for (int i = 0; i < NumInputElements; ++i) {
13378 if (M[i] == -1) {
13379 ++NumLHSMatch;
13380 ++NumRHSMatch;
13381 continue;
13382 }
13383
13384 if (M[i] == i)
13385 ++NumLHSMatch;
13386 else
13387 LastLHSMismatch = i;
13388
13389 if (M[i] == i + NumInputElements)
13390 ++NumRHSMatch;
13391 else
13392 LastRHSMismatch = i;
13393 }
13394
13395 if (NumLHSMatch == NumInputElements - 1) {
13396 DstIsLeft = true;
13397 Anomaly = LastLHSMismatch;
13398 return true;
13399 } else if (NumRHSMatch == NumInputElements - 1) {
13400 DstIsLeft = false;
13401 Anomaly = LastRHSMismatch;
13402 return true;
13403 }
13404
13405 return false;
13406}
13407
13408static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13409 if (VT.getSizeInBits() != 128)
13410 return false;
13411
13412 unsigned NumElts = VT.getVectorNumElements();
13413
13414 for (int I = 0, E = NumElts / 2; I != E; I++) {
13415 if (Mask[I] != I)
13416 return false;
13417 }
13418
13419 int Offset = NumElts / 2;
13420 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13421 if (Mask[I] != I + SplitLHS * Offset)
13422 return false;
13423 }
13424
13425 return true;
13426}
13427
13428 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
13429 SDLoc DL(Op);
13430 EVT VT = Op.getValueType();
13431 SDValue V0 = Op.getOperand(0);
13432 SDValue V1 = Op.getOperand(1);
13433 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13434
13435 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
13436 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
13437 return SDValue();
13438
13439 bool SplitV0 = V0.getValueSizeInBits() == 128;
13440
13441 if (!isConcatMask(Mask, VT, SplitV0))
13442 return SDValue();
13443
13444 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13445 if (SplitV0) {
13446 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
13447 DAG.getConstant(0, DL, MVT::i64));
13448 }
13449 if (V1.getValueSizeInBits() == 128) {
13450 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
13451 DAG.getConstant(0, DL, MVT::i64));
13452 }
13453 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
13454}
13455
13456/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
13457/// the specified operations to build the shuffle. ID is the perfect-shuffle
13458 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
13459 /// table entry and LHS/RHS are the immediate inputs for this stage of the
13460 /// shuffle.
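// Informally, each 13-bit ID packs the four result lanes as base-9 digits
// (most significant digit first): digits 0-7 select a lane of V1/V2 and 8
// means undef, so e.g. (1*9+2)*9+3 encodes <0,1,2,3>. getPFIDLane below
// decodes a single digit.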
13461 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
13462 SDValue V2, unsigned PFEntry, SDValue LHS,
13463 SDValue RHS, SelectionDAG &DAG,
13464 const SDLoc &dl) {
13465 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13466 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13467 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13468
13469 enum {
13470 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13471 OP_VREV,
13472 OP_VDUP0,
13473 OP_VDUP1,
13474 OP_VDUP2,
13475 OP_VDUP3,
13476 OP_VEXT1,
13477 OP_VEXT2,
13478 OP_VEXT3,
13479 OP_VUZPL, // VUZP, left result
13480 OP_VUZPR, // VUZP, right result
13481 OP_VZIPL, // VZIP, left result
13482 OP_VZIPR, // VZIP, right result
13483 OP_VTRNL, // VTRN, left result
13484 OP_VTRNR, // VTRN, right result
13485 OP_MOVLANE // Move lane. RHSID is the lane to move into
13486 };
13487
13488 if (OpNum == OP_COPY) {
13489 if (LHSID == (1 * 9 + 2) * 9 + 3)
13490 return LHS;
13491 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13492 return RHS;
13493 }
13494
13495 if (OpNum == OP_MOVLANE) {
13496 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13497 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13498 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13499 Elt = 3 - Elt;
13500 while (Elt > 0) {
13501 ID /= 9;
13502 Elt--;
13503 }
13504 return (ID % 9 == 8) ? -1 : ID % 9;
13505 };
13506
13507 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13508 // get the lane to move from the PFID, which is always from the
13509 // original vectors (V1 or V2).
13510 SDValue OpLHS = GeneratePerfectShuffle(
13511 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
13512 EVT VT = OpLHS.getValueType();
13513 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13514 unsigned ExtLane = 0;
13515 SDValue Input;
13516
13517 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
13518 // convert into a higher type.
13519 if (RHSID & 0x4) {
13520 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13521 if (MaskElt == -1)
13522 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13523 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13524 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13525 Input = MaskElt < 2 ? V1 : V2;
13526 if (VT.getScalarSizeInBits() == 16) {
13527 Input = DAG.getBitcast(MVT::v2f32, Input);
13528 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
13529 } else {
13530 assert(VT.getScalarSizeInBits() == 32 &&
13531 "Expected 16 or 32 bit shuffle elemements");
13532 Input = DAG.getBitcast(MVT::v2f64, Input);
13533 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
13534 }
13535 } else {
13536 int MaskElt = getPFIDLane(ID, RHSID);
13537 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13538 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13539 Input = MaskElt < 4 ? V1 : V2;
13540 // Be careful about creating illegal types. Use f16 instead of i16.
13541 if (VT == MVT::v4i16) {
13542 Input = DAG.getBitcast(MVT::v4f16, Input);
13543 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
13544 }
13545 }
13546 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13547 Input.getValueType().getVectorElementType(),
13548 Input, DAG.getVectorIdxConstant(ExtLane, dl));
13549 SDValue Ins =
13550 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
13551 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
13552 return DAG.getBitcast(VT, Ins);
13553 }
13554
13555 SDValue OpLHS, OpRHS;
13556 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
13557 RHS, DAG, dl);
13558 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
13559 RHS, DAG, dl);
13560 EVT VT = OpLHS.getValueType();
13561
13562 switch (OpNum) {
13563 default:
13564 llvm_unreachable("Unknown shuffle opcode!");
13565 case OP_VREV:
13566 // VREV divides the vector in half and swaps within the half.
13567 if (VT.getVectorElementType() == MVT::i32 ||
13568 VT.getVectorElementType() == MVT::f32)
13569 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
13570 // vrev <4 x i16> -> REV32
13571 if (VT.getVectorElementType() == MVT::i16 ||
13572 VT.getVectorElementType() == MVT::f16 ||
13573 VT.getVectorElementType() == MVT::bf16)
13574 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
13575 // vrev <4 x i8> -> REV16
13576 assert(VT.getVectorElementType() == MVT::i8);
13577 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
13578 case OP_VDUP0:
13579 case OP_VDUP1:
13580 case OP_VDUP2:
13581 case OP_VDUP3: {
13582 EVT EltTy = VT.getVectorElementType();
13583 unsigned Opcode;
13584 if (EltTy == MVT::i8)
13585 Opcode = AArch64ISD::DUPLANE8;
13586 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13587 Opcode = AArch64ISD::DUPLANE16;
13588 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13589 Opcode = AArch64ISD::DUPLANE32;
13590 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13591 Opcode = AArch64ISD::DUPLANE64;
13592 else
13593 llvm_unreachable("Invalid vector element type?");
13594
13595 if (VT.getSizeInBits() == 64)
13596 OpLHS = WidenVector(OpLHS, DAG);
13597 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
13598 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
13599 }
13600 case OP_VEXT1:
13601 case OP_VEXT2:
13602 case OP_VEXT3: {
13603 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
13604 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
13605 DAG.getConstant(Imm, dl, MVT::i32));
13606 }
13607 case OP_VUZPL:
13608 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
13609 case OP_VUZPR:
13610 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
13611 case OP_VZIPL:
13612 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
13613 case OP_VZIPR:
13614 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
13615 case OP_VTRNL:
13616 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
13617 case OP_VTRNR:
13618 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
13619 }
13620}
13621
13622 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
13623 SelectionDAG &DAG) {
13624 // Check to see if we can use the TBL instruction.
13625 SDValue V1 = Op.getOperand(0);
13626 SDValue V2 = Op.getOperand(1);
13627 SDLoc DL(Op);
13628
13629 EVT EltVT = Op.getValueType().getVectorElementType();
13630 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13631
13632 bool Swap = false;
13633 if (V1.isUndef() || isZerosVector(V1.getNode())) {
13634 std::swap(V1, V2);
13635 Swap = true;
13636 }
13637
13638 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13639 // out of range values with 0s. We do need to make sure that any out-of-range
13640 // values are really out-of-range for a v16i8 vector.
13641 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
13642 MVT IndexVT = MVT::v8i8;
13643 unsigned IndexLen = 8;
13644 if (Op.getValueSizeInBits() == 128) {
13645 IndexVT = MVT::v16i8;
13646 IndexLen = 16;
13647 }
13648
13649 SmallVector<SDValue, 8> TBLMask;
13650 for (int Val : ShuffleMask) {
13651 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13652 unsigned Offset = Byte + Val * BytesPerElt;
13653 if (Swap)
13654 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13655 if (IsUndefOrZero && Offset >= IndexLen)
13656 Offset = 255;
13657 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
13658 }
13659 }
13660
13661 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
13662 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
13663
13664 SDValue Shuffle;
13665 if (IsUndefOrZero) {
13666 if (IndexLen == 8)
13667 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
13668 Shuffle = DAG.getNode(
13669 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13670 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13671 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13672 } else {
13673 if (IndexLen == 8) {
13674 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
13675 Shuffle = DAG.getNode(
13676 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13677 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13678 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13679 } else {
13680 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13681 // cannot currently represent the register constraints on the input
13682 // table registers.
13683 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13684 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13685 // IndexLen));
13686 Shuffle = DAG.getNode(
13687 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13688 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
13689 V2Cst,
13690 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13691 }
13692 }
13693 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
13694}
13695
13696static unsigned getDUPLANEOp(EVT EltType) {
13697 if (EltType == MVT::i8)
13698 return AArch64ISD::DUPLANE8;
13699 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13700 return AArch64ISD::DUPLANE16;
13701 if (EltType == MVT::i32 || EltType == MVT::f32)
13702 return AArch64ISD::DUPLANE32;
13703 if (EltType == MVT::i64 || EltType == MVT::f64)
13704 return AArch64ISD::DUPLANE64;
13705
13706 llvm_unreachable("Invalid vector element type?");
13707}
13708
13709static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
13710 unsigned Opcode, SelectionDAG &DAG) {
13711 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
13712 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
13713 // Match: dup (bitcast (extract_subv X, C)), LaneC
13714 if (BitCast.getOpcode() != ISD::BITCAST ||
13715 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
13716 return false;
13717
13718 // The extract index must align in the destination type. That may not
13719 // happen if the bitcast is from narrow to wide type.
13720 SDValue Extract = BitCast.getOperand(0);
13721 unsigned ExtIdx = Extract.getConstantOperandVal(1);
13722 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
13723 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
13724 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
13725 if (ExtIdxInBits % CastedEltBitWidth != 0)
13726 return false;
13727
13728 // Can't handle cases where vector size is not 128-bit
13729 if (!Extract.getOperand(0).getValueType().is128BitVector())
13730 return false;
13731
13732 // Update the lane value by offsetting with the scaled extract index.
13733 LaneC += ExtIdxInBits / CastedEltBitWidth;
13734
13735 // Determine the casted vector type of the wide vector input.
13736 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
13737 // Examples:
13738 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
13739 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
13740 unsigned SrcVecNumElts =
13741 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
13742 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
13743 SrcVecNumElts);
13744 return true;
13745 };
13746 MVT CastVT;
13747 if (getScaledOffsetDup(V, Lane, CastVT)) {
13748 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
13749 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13750 V.getOperand(0).getValueType().is128BitVector()) {
13751 // The lane is incremented by the index of the extract.
13752 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
13753 Lane += V.getConstantOperandVal(1);
13754 V = V.getOperand(0);
13755 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
13756 // The lane is decremented if we are splatting from the 2nd operand.
13757 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
13758 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
13759 Lane -= Idx * VT.getVectorNumElements() / 2;
13760 V = WidenVector(V.getOperand(Idx), DAG);
13761 } else if (VT.getSizeInBits() == 64) {
13762 // Widen the operand to 128-bit register with undef.
13763 V = WidenVector(V, DAG);
13764 }
13765 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
13766}
13767
13768// Try to widen element type to get a new mask value for a better permutation
13769// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13770// UZP1/2, TRN1/2, REV, INS, etc.
13771// For example:
13772// shufflevector <4 x i32> %a, <4 x i32> %b,
13773// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13774// is equivalent to:
13775// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13776// Finally, we can get:
13777// mov v0.d[0], v1.d[1]
13778static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13779 SDLoc DL(Op);
13780 EVT VT = Op.getValueType();
13781 EVT ScalarVT = VT.getVectorElementType();
13782 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13783 SDValue V0 = Op.getOperand(0);
13784 SDValue V1 = Op.getOperand(1);
13785 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13786
13787 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
13788 // We need to make sure the wider element type is legal. Thus, ElementSize
13789 // should not be larger than 32 bits, and the i1 type should also be excluded.
13790 if (ElementSize > 32 || ElementSize == 1)
13791 return SDValue();
13792
13793 SmallVector<int, 8> NewMask;
13794 if (widenShuffleMaskElts(Mask, NewMask)) {
13795 MVT NewEltVT = VT.isFloatingPoint()
13796 ? MVT::getFloatingPointVT(ElementSize * 2)
13797 : MVT::getIntegerVT(ElementSize * 2);
13798 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13799 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13800 V0 = DAG.getBitcast(NewVT, V0);
13801 V1 = DAG.getBitcast(NewVT, V1);
13802 return DAG.getBitcast(VT,
13803 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
13804 }
13805 }
13806
13807 return SDValue();
13808}
13809
13810// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
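// For example (illustrative): shuffle (tbl2 a, b, m1), (tbl2 c, d, m2) with a
// byte-level shuffle mask becomes tbl4 a, b, c, d, m3, where m3 copies entries
// from m1 for shuffle indices < 16 and uses the m2 entry plus 32 for indices
// >= 16, because c and d occupy table registers 2 and 3 of the tbl4.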
13811static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13812 ArrayRef<int> ShuffleMask,
13813 SelectionDAG &DAG) {
13814 SDValue Tbl1 = Op->getOperand(0);
13815 SDValue Tbl2 = Op->getOperand(1);
13816 SDLoc dl(Op);
13817 SDValue Tbl2ID =
13818 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
13819
13820 EVT VT = Op.getValueType();
13821 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13822 Tbl1->getOperand(0) != Tbl2ID ||
13823 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13824 Tbl2->getOperand(0) != Tbl2ID)
13825 return SDValue();
13826
13827 if (Tbl1->getValueType(0) != MVT::v16i8 ||
13828 Tbl2->getValueType(0) != MVT::v16i8)
13829 return SDValue();
13830
13831 SDValue Mask1 = Tbl1->getOperand(3);
13832 SDValue Mask2 = Tbl2->getOperand(3);
13833 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13834 for (unsigned I = 0; I < 16; I++) {
13835 if (ShuffleMask[I] < 16)
13836 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
13837 else {
13838 auto *C =
13839 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
13840 if (!C)
13841 return SDValue();
13842 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
13843 }
13844 }
13845
13846 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
13847 SDValue ID =
13848 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
13849
13850 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
13851 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
13852 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
13853}
13854
13855// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13856// but we don't have an appropriate instruction,
13857// so custom-lower it as ZIP1-with-zeros.
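// For example (illustrative): zero_extend_vector_inreg from v16i8 to v8i16 is
// lowered as zip1(src, zeros) followed by a bitcast, so each of the low eight
// source bytes is interleaved with a zero byte to form the widened lanes.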
13858SDValue
13859AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13860 SelectionDAG &DAG) const {
13861 SDLoc dl(Op);
13862 EVT VT = Op.getValueType();
13863 SDValue SrcOp = Op.getOperand(0);
13864 EVT SrcVT = SrcOp.getValueType();
13865 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13866 "Unexpected extension factor.");
13867 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13868 // FIXME: support multi-step zipping?
13869 if (Scale != 2)
13870 return SDValue();
13871 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
13872 return DAG.getBitcast(VT,
13873 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
13874}
13875
13876SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13877 SelectionDAG &DAG) const {
13878 SDLoc dl(Op);
13879 EVT VT = Op.getValueType();
13880
13881 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
13882
13883 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13884 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13885
13886 // Convert shuffles that are directly supported on NEON to target-specific
13887 // DAG nodes, instead of keeping them as shuffles and matching them again
13888 // during code selection. This is more efficient and avoids the possibility
13889 // of inconsistencies between legalization and selection.
13890 ArrayRef<int> ShuffleMask = SVN->getMask();
13891
13892 SDValue V1 = Op.getOperand(0);
13893 SDValue V2 = Op.getOperand(1);
13894
13895 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13896 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13897 "Unexpected VECTOR_SHUFFLE mask size!");
13898
13899 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13900 return Res;
13901
13902 if (SVN->isSplat()) {
13903 int Lane = SVN->getSplatIndex();
13904 // If this is undef splat, generate it via "just" vdup, if possible.
13905 if (Lane == -1)
13906 Lane = 0;
13907
13908 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13909 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
13910 V1.getOperand(0));
13911 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13912 // constant. If so, we can just reference the lane's definition directly.
13913 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13914 !isa<ConstantSDNode>(V1.getOperand(Lane)))
13915 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
13916
13917 // Otherwise, duplicate from the lane of the input vector.
13918 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
13919 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
13920 }
13921
13922 // Check if the mask matches a DUP for a wider element
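  // For example (illustrative): a v8i16 mask <2, 3, 2, 3, 2, 3, 2, 3> splats
  // the second 32-bit lane of V1, so it is lowered as DUPLANE32 on a v4i32
  // bitcast of V1 and the result is bitcast back to v8i16.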
13923 for (unsigned LaneSize : {64U, 32U, 16U}) {
13924 unsigned Lane = 0;
13925 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
13926 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
13927 : LaneSize == 32 ? AArch64ISD::DUPLANE32
13928 : AArch64ISD::DUPLANE16;
13929 // Cast V1 to an integer vector with required lane size
13930 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
13931 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
13932 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
13933 V1 = DAG.getBitcast(NewVecTy, V1);
13934 // Construct the DUP instruction
13935 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
13936 // Cast back to the original type
13937 return DAG.getBitcast(VT, V1);
13938 }
13939 }
13940
13941 unsigned NumElts = VT.getVectorNumElements();
13942 unsigned EltSize = VT.getScalarSizeInBits();
13943 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
13944 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1);
13945 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
13946 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1);
13947 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
13948 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1);
13949
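  // A full reverse of 16-bit or 8-bit elements can be built from REV64, which
  // reverses the elements within each 64-bit half, followed by EXT #8, which
  // swaps the two halves of the 128-bit register.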
13950 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
13951 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
13952 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
13953 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
13954 DAG.getConstant(8, dl, MVT::i32));
13955 }
13956
13957 bool ReverseEXT = false;
13958 unsigned Imm;
13959 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
13960 if (ReverseEXT)
13961 std::swap(V1, V2);
13962 Imm *= getExtFactor(V1);
13963 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
13964 DAG.getConstant(Imm, dl, MVT::i32));
13965 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
13966 Imm *= getExtFactor(V1);
13967 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
13968 DAG.getConstant(Imm, dl, MVT::i32));
13969 }
13970
13971 unsigned WhichResult;
13972 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13973 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13974 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13975 }
13976 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13977 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13978 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13979 }
13980 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13981 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13982 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13983 }
13984
13985 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13986 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13987 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13988 }
13989 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13990 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13991 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13992 }
13993 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13994 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13995 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13996 }
13997
13998 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13999 return Concat;
14000
14001 bool DstIsLeft;
14002 int Anomaly;
14003 int NumInputElements = V1.getValueType().getVectorNumElements();
14004 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14005 SDValue DstVec = DstIsLeft ? V1 : V2;
14006 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
14007
14008 SDValue SrcVec = V1;
14009 int SrcLane = ShuffleMask[Anomaly];
14010 if (SrcLane >= NumInputElements) {
14011 SrcVec = V2;
14012 SrcLane -= NumElts;
14013 }
14014 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
14015
14016 EVT ScalarVT = VT.getVectorElementType();
14017
14018 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14019 ScalarVT = MVT::i32;
14020
14021 return DAG.getNode(
14022 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
14023 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
14024 DstLaneV);
14025 }
14026
14027 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14028 return NewSD;
14029
14030 // If the shuffle is not directly supported and it has 4 elements, use
14031 // the PerfectShuffle-generated table to synthesize it from other shuffles.
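  // Each mask element (0-7, or 8 for undef) acts as a base-9 digit; for
  // example (illustrative), the mask <1, 4, 2, 6> gives
  // PFTableIndex = 1*729 + 4*81 + 2*9 + 6 = 1077.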
14032 if (NumElts == 4) {
14033 unsigned PFIndexes[4];
14034 for (unsigned i = 0; i != 4; ++i) {
14035 if (ShuffleMask[i] < 0)
14036 PFIndexes[i] = 8;
14037 else
14038 PFIndexes[i] = ShuffleMask[i];
14039 }
14040
14041 // Compute the index in the perfect shuffle table.
14042 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14043 PFIndexes[2] * 9 + PFIndexes[3];
14044 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14045 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14046 dl);
14047 }
14048
14049 // Check for a "select shuffle", generating a BSL to pick between lanes in
14050 // V1/V2.
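  // For example (illustrative): a v4i32 mask <0, 5, 2, 7> takes lanes 0 and 2
  // from V1 and lanes 1 and 3 from V2, so the constant mask becomes
  // <-1, 0, -1, 0> and the result is BSP(mask, V1, V2).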
14051 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14052 assert(VT.getScalarSizeInBits() <= 32 &&
14053 "Expected larger vector element sizes to be handled already");
14054 SmallVector<SDValue> MaskElts;
14055 for (int M : ShuffleMask)
14056 MaskElts.push_back(DAG.getConstant(
14057 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, dl, MVT::i32));
14058 EVT IVT = VT.changeVectorElementTypeToInteger();
14059 SDValue MaskConst = DAG.getBuildVector(IVT, dl, MaskElts);
14060 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, dl, IVT, MaskConst,
14061 DAG.getBitcast(IVT, V1),
14062 DAG.getBitcast(IVT, V2)));
14063 }
14064
14065 // Fall back to generating a TBL
14066 return GenerateTBL(Op, ShuffleMask, DAG);
14067}
14068
14069SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14070 SelectionDAG &DAG) const {
14071 EVT VT = Op.getValueType();
14072
14073 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14074 return LowerToScalableOp(Op, DAG);
14075
14076 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14077 "Unexpected vector type!");
14078
14079 // We can handle the constant cases during isel.
14080 if (isa<ConstantSDNode>(Op.getOperand(0)))
14081 return Op;
14082
14083 // There isn't a natural way to handle the general i1 case, so we use some
14084 // trickery with whilelo.
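  // For example (illustrative): splatting an i1 value V into nxv16i1 becomes
  // whilelo(0, sext(V)); when V is 0 no lane is active, and when V is 1 the
  // sign-extended value is the maximum unsigned 64-bit integer, so every lane
  // is active.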
14085 SDLoc DL(Op);
14086 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14087 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14088 DAG.getValueType(MVT::i1));
14089 SDValue ID =
14090 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14091 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14092 if (VT == MVT::nxv1i1)
14093 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14094 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14095 Zero, SplatVal),
14096 Zero);
14097 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14098}
14099
14100SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14101 SelectionDAG &DAG) const {
14102 SDLoc DL(Op);
14103
14104 EVT VT = Op.getValueType();
14105 if (!isTypeLegal(VT) || !VT.isScalableVector())
14106 return SDValue();
14107
14108 // Current lowering only supports the SVE-ACLE types.
14109 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14110 return SDValue();
14111
14112 // The DUPQ operation is independent of element type so normalise to i64s.
14113 SDValue Idx128 = Op.getOperand(2);
14114
14115 // DUPQ can be used when idx is in range.
14116 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14117 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14118 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14119 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14120 }
14121
14122 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14123
14124 // The ACLE says this must produce the same result as:
14125 // svtbl(data, svadd_x(svptrue_b64(),
14126 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14127 // index * 2))
14128 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14129 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14130
14131 // create the vector 0,1,0,1,...
14132 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14133 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14134
14135 // create the vector idx64,idx64+1,idx64,idx64+1,...
14136 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14137 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14138 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14139
14140 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14141 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14142 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14143}
14144
14145
14146static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14147 APInt &UndefBits) {
14148 EVT VT = BVN->getValueType(0);
14149 APInt SplatBits, SplatUndef;
14150 unsigned SplatBitSize;
14151 bool HasAnyUndefs;
14152 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14153 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14154
14155 for (unsigned i = 0; i < NumSplats; ++i) {
14156 CnstBits <<= SplatBitSize;
14157 UndefBits <<= SplatBitSize;
14158 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14159 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14160 }
14161
14162 return true;
14163 }
14164
14165 return false;
14166}
14167
14168// Try 64-bit splatted SIMD immediate.
14169static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14170 const APInt &Bits) {
14171 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14172 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14173 EVT VT = Op.getValueType();
14174 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14175
14176 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14177 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14178
14179 SDLoc dl(Op);
14180 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14181 DAG.getConstant(Value, dl, MVT::i32));
14182 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14183 }
14184 }
14185
14186 return SDValue();
14187}
14188
14189// Try 32-bit splatted SIMD immediate.
14190static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14191 const APInt &Bits,
14192 const SDValue *LHS = nullptr) {
14193 EVT VT = Op.getValueType();
14194 if (VT.isFixedLengthVector() &&
14195 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14196 return SDValue();
14197
14198 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14199 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14200 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14201 bool isAdvSIMDModImm = false;
14202 uint64_t Shift;
14203
14204 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14205 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14206 Shift = 0;
14207 }
14208 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14209 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14210 Shift = 8;
14211 }
14212 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14213 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14214 Shift = 16;
14215 }
14216 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14217 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14218 Shift = 24;
14219 }
14220
14221 if (isAdvSIMDModImm) {
14222 SDLoc dl(Op);
14223 SDValue Mov;
14224
14225 if (LHS)
14226 Mov = DAG.getNode(NewOp, dl, MovTy,
14227 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14228 DAG.getConstant(Value, dl, MVT::i32),
14229 DAG.getConstant(Shift, dl, MVT::i32));
14230 else
14231 Mov = DAG.getNode(NewOp, dl, MovTy,
14232 DAG.getConstant(Value, dl, MVT::i32),
14233 DAG.getConstant(Shift, dl, MVT::i32));
14234
14235 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14236 }
14237 }
14238
14239 return SDValue();
14240}
14241
14242// Try 16-bit splatted SIMD immediate.
14243static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14244 const APInt &Bits,
14245 const SDValue *LHS = nullptr) {
14246 EVT VT = Op.getValueType();
14247 if (VT.isFixedLengthVector() &&
14248 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14249 return SDValue();
14250
14251 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14252 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14253 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14254 bool isAdvSIMDModImm = false;
14255 uint64_t Shift;
14256
14257 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14258 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14259 Shift = 0;
14260 }
14261 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14262 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14263 Shift = 8;
14264 }
14265
14266 if (isAdvSIMDModImm) {
14267 SDLoc dl(Op);
14268 SDValue Mov;
14269
14270 if (LHS)
14271 Mov = DAG.getNode(NewOp, dl, MovTy,
14272 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14273 DAG.getConstant(Value, dl, MVT::i32),
14274 DAG.getConstant(Shift, dl, MVT::i32));
14275 else
14276 Mov = DAG.getNode(NewOp, dl, MovTy,
14277 DAG.getConstant(Value, dl, MVT::i32),
14278 DAG.getConstant(Shift, dl, MVT::i32));
14279
14280 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14281 }
14282 }
14283
14284 return SDValue();
14285}
14286
14287// Try 32-bit splatted SIMD immediate with shifted ones.
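// The Shift values 264 and 272 used below correspond to the shifter-operand
// encodings for MSL #8 and MSL #16, the "shifting ones" forms of MOVI/MVNI.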
14288static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14289 SelectionDAG &DAG, const APInt &Bits) {
14290 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14291 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14292 EVT VT = Op.getValueType();
14293 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14294 bool isAdvSIMDModImm = false;
14295 uint64_t Shift;
14296
14297 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14298 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14299 Shift = 264;
14300 }
14301 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14302 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14303 Shift = 272;
14304 }
14305
14306 if (isAdvSIMDModImm) {
14307 SDLoc dl(Op);
14308 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14309 DAG.getConstant(Value, dl, MVT::i32),
14310 DAG.getConstant(Shift, dl, MVT::i32));
14311 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14312 }
14313 }
14314
14315 return SDValue();
14316}
14317
14318// Try 8-bit splatted SIMD immediate.
14319static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14320 const APInt &Bits) {
14321 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14322 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14323 EVT VT = Op.getValueType();
14324 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14325
14326 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14327 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14328
14329 SDLoc dl(Op);
14330 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14331 DAG.getConstant(Value, dl, MVT::i32));
14332 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14333 }
14334 }
14335
14336 return SDValue();
14337}
14338
14339// Try FP splatted SIMD immediate.
14340static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14341 const APInt &Bits) {
14342 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14343 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14344 EVT VT = Op.getValueType();
14345 bool isWide = (VT.getSizeInBits() == 128);
14346 MVT MovTy;
14347 bool isAdvSIMDModImm = false;
14348
14349 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14350 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14351 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14352 }
14353 else if (isWide &&
14354 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14355 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14356 MovTy = MVT::v2f64;
14357 }
14358
14359 if (isAdvSIMDModImm) {
14360 SDLoc dl(Op);
14361 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14362 DAG.getConstant(Value, dl, MVT::i32));
14363 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14364 }
14365 }
14366
14367 return SDValue();
14368}
14369
14370// Specialized code to quickly find if PotentialBVec is a BuildVector that
14371 // consists of only the same constant int value, which is returned in the
14372 // reference argument ConstVal.
14373static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14374 uint64_t &ConstVal) {
14375 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14376 if (!Bvec)
14377 return false;
14378 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14379 if (!FirstElt)
14380 return false;
14381 EVT VT = Bvec->getValueType(0);
14382 unsigned NumElts = VT.getVectorNumElements();
14383 for (unsigned i = 1; i < NumElts; ++i)
14384 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14385 return false;
14386 ConstVal = FirstElt->getZExtValue();
14387 return true;
14388}
14389
14390static bool isAllInactivePredicate(SDValue N) {
14391 // Look through cast.
14392 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14393 N = N.getOperand(0);
14394
14395 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14396}
14397
14398static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14399 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14400
14401 // Look through cast.
14402 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14403 N = N.getOperand(0);
14404 // When reinterpreting from a type with fewer elements the "new" elements
14405 // are not active, so bail if they're likely to be used.
14406 if (N.getValueType().getVectorMinNumElements() < NumElts)
14407 return false;
14408 }
14409
14410 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14411 return true;
14412
14413 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14414 // or smaller than the implicit element type represented by N.
14415 // NOTE: A larger element count implies a smaller element type.
14416 if (N.getOpcode() == AArch64ISD::PTRUE &&
14417 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14418 return N.getValueType().getVectorMinNumElements() >= NumElts;
14419
14420 // If we're compiling for a specific vector-length, we can check if the
14421 // pattern's VL equals that of the scalable vector at runtime.
14422 if (N.getOpcode() == AArch64ISD::PTRUE) {
14423 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14424 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14425 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14426 if (MaxSVESize && MinSVESize == MaxSVESize) {
14427 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14428 unsigned PatNumElts =
14429 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14430 return PatNumElts == (NumElts * VScale);
14431 }
14432 }
14433
14434 return false;
14435}
14436
14437// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14438// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14439// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14440// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14441// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14442// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
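// For example (illustrative), with 32-bit elements:
// (or (and X, splat(0x00ffffff)), (shl Y, 24)) matches the SLI form because
// 0x00ffffff == ~(0xffffffff << 24), and is rewritten as SLI X, Y, #24.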
14443static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14444 EVT VT = N->getValueType(0);
14445
14446 if (!VT.isVector())
14447 return SDValue();
14448
14449 SDLoc DL(N);
14450
14451 SDValue And;
14452 SDValue Shift;
14453
14454 SDValue FirstOp = N->getOperand(0);
14455 unsigned FirstOpc = FirstOp.getOpcode();
14456 SDValue SecondOp = N->getOperand(1);
14457 unsigned SecondOpc = SecondOp.getOpcode();
14458
14459 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14460 // a BICi in order to use an immediate instead of a register.
14461 // Is the other operand a shl or lshr? This will have been turned into:
14462 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14463 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14464 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14465 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14466 SecondOpc == AArch64ISD::SHL_PRED ||
14467 SecondOpc == AArch64ISD::SRL_PRED)) {
14468 And = FirstOp;
14469 Shift = SecondOp;
14470
14471 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14472 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14473 FirstOpc == AArch64ISD::SHL_PRED ||
14474 FirstOpc == AArch64ISD::SRL_PRED)) {
14475 And = SecondOp;
14476 Shift = FirstOp;
14477 } else
14478 return SDValue();
14479
14480 bool IsAnd = And.getOpcode() == ISD::AND;
14481 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14482 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14483 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14484 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14485
14486 // Is the shift amount constant and are all lanes active?
14487 uint64_t C2;
14488 if (ShiftHasPredOp) {
14489 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
14490 return SDValue();
14491 APInt C;
14492 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
14493 return SDValue();
14494 C2 = C.getZExtValue();
14495 } else if (ConstantSDNode *C2node =
14496 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
14497 C2 = C2node->getZExtValue();
14498 else
14499 return SDValue();
14500
14501 APInt C1AsAPInt;
14502 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14503 if (IsAnd) {
14504 // Is the and mask vector all constant?
14505 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
14506 return SDValue();
14507 } else {
14508 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14509 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
14510 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
14511 assert(C1nodeImm && C1nodeShift);
14512 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14513 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
14514 }
14515
14516 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14517 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14518 // how much one can shift elements of a particular size?
14519 if (C2 > ElemSizeInBits)
14520 return SDValue();
14521
14522 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
14523 : APInt::getLowBitsSet(ElemSizeInBits, C2);
14524 if (C1AsAPInt != RequiredC1)
14525 return SDValue();
14526
14527 SDValue X = And.getOperand(0);
14528 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
14529 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
14530 : Shift.getOperand(1);
14531
14532 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14533 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
14534
14535 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
14536 LLVM_DEBUG(N->dump(&DAG));
14537 LLVM_DEBUG(dbgs() << "into: \n");
14538 LLVM_DEBUG(ResultSLI->dump(&DAG));
14539
14540 ++NumShiftInserts;
14541 return ResultSLI;
14542}
14543
14544SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14545 SelectionDAG &DAG) const {
14546 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14547 !Subtarget->isNeonAvailable()))
14548 return LowerToScalableOp(Op, DAG);
14549
14550 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14551 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
14552 return Res;
14553
14554 EVT VT = Op.getValueType();
14555 if (VT.isScalableVector())
14556 return Op;
14557
14558 SDValue LHS = Op.getOperand(0);
14559 BuildVectorSDNode *BVN =
14560 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
14561 if (!BVN) {
14562 // OR commutes, so try swapping the operands.
14563 LHS = Op.getOperand(1);
14564 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
14565 }
14566 if (!BVN)
14567 return Op;
14568
14569 APInt DefBits(VT.getSizeInBits(), 0);
14570 APInt UndefBits(VT.getSizeInBits(), 0);
14571 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14572 SDValue NewOp;
14573
14574 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14575 DefBits, &LHS)) ||
14576 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14577 DefBits, &LHS)))
14578 return NewOp;
14579
14580 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14581 UndefBits, &LHS)) ||
14582 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14583 UndefBits, &LHS)))
14584 return NewOp;
14585 }
14586
14587 // We can always fall back to a non-immediate OR.
14588 return Op;
14589}
14590
14591// Normalize the operands of BUILD_VECTOR. The value of constant operands will
14592// be truncated to fit element width.
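// For example (illustrative): a v4i16 BUILD_VECTOR whose operands are i32
// constants keeps each operand as an i32 node but truncates its value to 16
// bits, so a lane value of 0x12345 becomes 0x2345.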
14593static SDValue NormalizeBuildVector(SDValue Op,
14594 SelectionDAG &DAG) {
14595 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14596 SDLoc dl(Op);
14597 EVT VT = Op.getValueType();
14598 EVT EltTy = VT.getVectorElementType();
14599
14600 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14601 return Op;
14602
14603 SmallVector<SDValue, 16> Ops;
14604 for (SDValue Lane : Op->ops()) {
14605 // For integer vectors, type legalization would have promoted the
14606 // operands already. Otherwise, if Op is a floating-point splat
14607 // (with operands cast to integers), then the only possibilities
14608 // are constants and UNDEFs.
14609 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
14610 Lane = DAG.getConstant(
14611 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
14612 dl, MVT::i32);
14613 } else if (Lane.getNode()->isUndef()) {
14614 Lane = DAG.getUNDEF(MVT::i32);
14615 } else {
14616 assert(Lane.getValueType() == MVT::i32 &&
14617 "Unexpected BUILD_VECTOR operand type");
14618 }
14619 Ops.push_back(Lane);
14620 }
14621 return DAG.getBuildVector(VT, dl, Ops);
14622}
14623
14624static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
14625 const AArch64Subtarget *ST) {
14626 EVT VT = Op.getValueType();
14627 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
14628 "Expected a legal NEON vector");
14629
14630 APInt DefBits(VT.getSizeInBits(), 0);
14631 APInt UndefBits(VT.getSizeInBits(), 0);
14632 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14633 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14634 auto TryMOVIWithBits = [&](APInt DefBits) {
14635 SDValue NewOp;
14636 if ((NewOp =
14637 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
14638 (NewOp =
14639 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14640 (NewOp =
14641 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
14642 (NewOp =
14643 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14644 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
14645 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
14646 return NewOp;
14647
14648 APInt NotDefBits = ~DefBits;
14649 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
14650 NotDefBits)) ||
14651 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
14652 NotDefBits)) ||
14653 (NewOp =
14654 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
14655 return NewOp;
14656 return SDValue();
14657 };
14658 if (SDValue R = TryMOVIWithBits(DefBits))
14659 return R;
14660 if (SDValue R = TryMOVIWithBits(UndefBits))
14661 return R;
14662
14663 // See if a fneg of the constant can be materialized with a MOVI, etc
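    // For example (illustrative): a v2f64 splat of -0.0 has no direct MOVI or
    // FMOV encoding, but clearing the sign bit of each f64 lane leaves all
    // zero bits, which MOVI can materialize; the final value is then produced
    // by an FNEG of that MOVI.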
14664 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
14665 // FNegate each sub-element of the constant
14666 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
14667 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
14668 .zext(VT.getSizeInBits());
14669 APInt NegBits(VT.getSizeInBits(), 0);
14670 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
14671 for (unsigned i = 0; i < NumElts; i++)
14672 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
14673 NegBits = DefBits ^ NegBits;
14674
14675 // Try to create the new constants with MOVI, and if so generate a fneg
14676 // for it.
14677 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
14678 SDLoc DL(Op);
14679 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
14680 return DAG.getNode(
14681 AArch64ISD::NVCAST, DL, VT,
14682 DAG.getNode(ISD::FNEG, DL, VFVT,
14683 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
14684 }
14685 return SDValue();
14686 };
14687 SDValue R;
14688 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
14689 (R = TryWithFNeg(DefBits, MVT::f64)) ||
14690 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
14691 return R;
14692 }
14693
14694 return SDValue();
14695}
14696
14697SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
14698 SDValue Op, SelectionDAG &DAG) const {
14699 EVT VT = Op.getValueType();
14700 SDLoc DL(Op);
14701 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
14702 auto *BVN = cast<BuildVectorSDNode>(Op);
14703
14704 if (auto SeqInfo = BVN->isConstantSequence()) {
14705 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
14706 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
14707 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
14708 return convertFromScalableVector(DAG, VT, Seq);
14709 }
14710
14711 unsigned NumElems = VT.getVectorNumElements();
14712 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
14713 NumElems <= 1 || BVN->isConstant())
14714 return SDValue();
14715
14716 auto IsExtractElt = [](SDValue Op) {
14717 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
14718 };
14719
14720 // For integer types that are not already in vectors limit to at most four
14721 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
14722 if (VT.getScalarType().isInteger() &&
14723 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
14724 return SDValue();
14725
14726 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
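  // For example (illustrative): a four-element build_vector a,b,c,d becomes
  // zip1(zip1(a,b), zip1(c,d)): each element is first inserted into lane 0 of
  // an SVE container, then pairs are interleaved with ZIP1 at successively
  // doubled element widths until a single vector remains.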
14727 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
14728 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
14729 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
14730 return Op.isUndef() ? Undef
14731 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
14732 ContainerVT, Undef, Op, ZeroI64);
14733 });
14734
14735 ElementCount ZipEC = ContainerVT.getVectorElementCount();
14736 while (Intermediates.size() > 1) {
14737 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
14738
14739 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
14740 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
14741 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
14742 Intermediates[I / 2] =
14743 Op1.isUndef() ? Op0
14744 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
14745 }
14746
14747 Intermediates.resize(Intermediates.size() / 2);
14748 ZipEC = ZipEC.divideCoefficientBy(2);
14749 }
14750
14751 assert(Intermediates.size() == 1);
14752 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
14753 return convertFromScalableVector(DAG, VT, Vec);
14754}
14755
14756SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
14757 SelectionDAG &DAG) const {
14758 EVT VT = Op.getValueType();
14759
14760 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14761 cast<BuildVectorSDNode>(Op)->isConstantSequence();
14762 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
14763 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
14764
14765 // Try to build a simple constant vector.
14766 Op = NormalizeBuildVector(Op, DAG);
14767 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
14768 // abort.
14769 if (Op.getOpcode() != ISD::BUILD_VECTOR)
14770 return SDValue();
14771
14772 // Certain vector constants, used to express things like logical NOT and
14773 // arithmetic NEG, are passed through unmodified. This allows special
14774 // patterns for these operations to match, which will lower these constants
14775 // to whatever is proven necessary.
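  // For example (illustrative): an all-ones v4i32 splat used as the RHS of an
  // XOR is left as a plain BUILD_VECTOR here so that the (xor x, -1) style
  // "not" patterns can still match during instruction selection.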
14776 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14777 if (BVN->isConstant()) {
14778 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
14779 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
14780 APInt Val(BitSize,
14781 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
14782 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
14783 return Op;
14784 }
14785 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
14786 if (Const->isZero() && !Const->isNegative())
14787 return Op;
14788 }
14789
14790 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
14791 return V;
14792
14793 // Scan through the operands to find some interesting properties we can
14794 // exploit:
14795 // 1) If only one value is used, we can use a DUP, or
14796 // 2) if only the low element is not undef, we can just insert that, or
14797 // 3) if only one constant value is used (w/ some non-constant lanes),
14798 // we can splat the constant value into the whole vector then fill
14799 // in the non-constant lanes.
14800 // 4) FIXME: If different constant values are used, but we can intelligently
14801 // select the values we'll be overwriting for the non-constant
14802 // lanes such that we can directly materialize the vector
14803 // some other way (MOVI, e.g.), we can be sneaky.
14804 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
14805 SDLoc dl(Op);
14806 unsigned NumElts = VT.getVectorNumElements();
14807 bool isOnlyLowElement = true;
14808 bool usesOnlyOneValue = true;
14809 bool usesOnlyOneConstantValue = true;
14810 bool isConstant = true;
14811 bool AllLanesExtractElt = true;
14812 unsigned NumConstantLanes = 0;
14813 unsigned NumDifferentLanes = 0;
14814 unsigned NumUndefLanes = 0;
14815 SDValue Value;
14816 SDValue ConstantValue;
14817 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14818 unsigned ConsecutiveValCount = 0;
14819 SDValue PrevVal;
14820 for (unsigned i = 0; i < NumElts; ++i) {
14821 SDValue V = Op.getOperand(i);
14822 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14823 AllLanesExtractElt = false;
14824 if (V.isUndef()) {
14825 ++NumUndefLanes;
14826 continue;
14827 }
14828 if (i > 0)
14829 isOnlyLowElement = false;
14830 if (!isIntOrFPConstant(V))
14831 isConstant = false;
14832
14833 if (isIntOrFPConstant(V)) {
14834 ++NumConstantLanes;
14835 if (!ConstantValue.getNode())
14836 ConstantValue = V;
14837 else if (ConstantValue != V)
14838 usesOnlyOneConstantValue = false;
14839 }
14840
14841 if (!Value.getNode())
14842 Value = V;
14843 else if (V != Value) {
14844 usesOnlyOneValue = false;
14845 ++NumDifferentLanes;
14846 }
14847
14848 if (PrevVal != V) {
14849 ConsecutiveValCount = 0;
14850 PrevVal = V;
14851 }
14852
14853 // Keep each different value and its last consecutive count. For example,
14854 //
14855 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14856 // t24, t24, t24, t24, t24, t24, t24, t24
14857 // t23 = consecutive count 8
14858 // t24 = consecutive count 8
14859 // ------------------------------------------------------------------
14860 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14861 // t24, t24, t24, t24, t24, t24, t24, t24
14862 // t23 = consecutive count 5
14863 // t24 = consecutive count 9
14864 DifferentValueMap[V] = ++ConsecutiveValCount;
14865 }
14866
14867 if (!Value.getNode()) {
14868 LLVM_DEBUG(
14869 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14870 return DAG.getUNDEF(VT);
14871 }
14872
14873 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14874 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
14875 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14876 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
14877 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14878 "SCALAR_TO_VECTOR node\n");
14879 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
14880 }
14881
14882 if (AllLanesExtractElt) {
14883 SDNode *Vector = nullptr;
14884 bool Even = false;
14885 bool Odd = false;
14886 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14887 // the Odd pattern <1,3,5,...>.
14888 for (unsigned i = 0; i < NumElts; ++i) {
14889 SDValue V = Op.getOperand(i);
14890 const SDNode *N = V.getNode();
14891 if (!isa<ConstantSDNode>(N->getOperand(1))) {
14892 Even = false;
14893 Odd = false;
14894 break;
14895 }
14896 SDValue N0 = N->getOperand(0);
14897
14898 // All elements are extracted from the same vector.
14899 if (!Vector) {
14900 Vector = N0.getNode();
14901 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14902 // BUILD_VECTOR.
14903 if (VT.getVectorElementType() !=
14904 N0.getValueType().getVectorElementType())
14905 break;
14906 } else if (Vector != N0.getNode()) {
14907 Odd = false;
14908 Even = false;
14909 break;
14910 }
14911
14912 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14913 // indices <1,3,5,...>.
14914 uint64_t Val = N->getConstantOperandVal(1);
14915 if (Val == 2 * i) {
14916 Even = true;
14917 continue;
14918 }
14919 if (Val - 1 == 2 * i) {
14920 Odd = true;
14921 continue;
14922 }
14923
14924 // Something does not match: abort.
14925 Odd = false;
14926 Even = false;
14927 break;
14928 }
14929 if (Even || Odd) {
14930 SDValue LHS =
14931 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14932 DAG.getConstant(0, dl, MVT::i64));
14933 SDValue RHS =
14934 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14935 DAG.getConstant(NumElts, dl, MVT::i64));
14936
14937 if (Even && !Odd)
14938 return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
14939 if (Odd && !Even)
14940 return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
14941 }
14942 }
14943
14944 // Use DUP for non-constant splats. For f32 constant splats, reduce to
14945 // i32 and try again.
14946 if (usesOnlyOneValue) {
14947 if (!isConstant) {
14948 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14949 Value.getValueType() != VT) {
14950 LLVM_DEBUG(
14951 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
14952 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
14953 }
14954
14955 // This is actually a DUPLANExx operation, which keeps everything vectory.
14956
14957 SDValue Lane = Value.getOperand(1);
14958 Value = Value.getOperand(0);
14959 if (Value.getValueSizeInBits() == 64) {
14960 LLVM_DEBUG(
14961 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
14962 "widening it\n");
14963 Value = WidenVector(Value, DAG);
14964 }
14965
14966 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
14967 return DAG.getNode(Opcode, dl, VT, Value, Lane);
14968 }
14969
14970 if (VT.getVectorElementType().isFloatingPoint()) {
14971 SmallVector<SDValue, 8> Ops;
14972 EVT EltTy = VT.getVectorElementType();
14973 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
14974 EltTy == MVT::f64) && "Unsupported floating-point vector type");
14975 LLVM_DEBUG(
14976 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
14977 "BITCASTS, and try again\n");
14978 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
14979 for (unsigned i = 0; i < NumElts; ++i)
14980 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
14981 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
14982 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
14983 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
14984 Val.dump(););
14985 Val = LowerBUILD_VECTOR(Val, DAG);
14986 if (Val.getNode())
14987 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
14988 }
14989 }
14990
14991 // If we need to insert a small number of different non-constant elements and
14992 // the vector width is sufficiently large, prefer using DUP with the common
14993 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
14994 // skip the constant lane handling below.
14995 bool PreferDUPAndInsert =
14996 !isConstant && NumDifferentLanes >= 1 &&
14997 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
14998 NumDifferentLanes >= NumConstantLanes;
14999
15000 // If only one constant value was used, and it is needed in more than one lane,
15001 // start by splatting that value, then replace the non-constant lanes. This
15002 // is better than the default, which will perform a separate initialization
15003 // for each lane.
15004 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15005 // Firstly, try to materialize the splat constant.
15006 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
15007 unsigned BitSize = VT.getScalarSizeInBits();
15008 APInt ConstantValueAPInt(1, 0);
15009 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15010 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15011 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15012 !ConstantValueAPInt.isAllOnes()) {
15013 Val = ConstantBuildVector(Val, DAG, Subtarget);
15014 if (!Val)
15015 // Otherwise, materialize the constant and splat it.
15016 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
15017 }
15018
15019 // Now insert the non-constant lanes.
15020 for (unsigned i = 0; i < NumElts; ++i) {
15021 SDValue V = Op.getOperand(i);
15022 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
15023 if (!isIntOrFPConstant(V))
15024 // Note that type legalization likely mucked about with the VT of the
15025 // source operand, so we may have to convert it here before inserting.
15026 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
15027 }
15028 return Val;
15029 }
15030
15031 // This will generate a load from the constant pool.
15032 if (isConstant) {
15033 LLVM_DEBUG(
15034 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15035 "expansion\n");
15036 return SDValue();
15037 }
15038
15039 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15040 // v4i32s. This is really a truncate, which we can construct out of (legal)
15041 // concats and truncate nodes.
15042 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15043 return M;
15044
15045 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15046 if (NumElts >= 4) {
15047 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15048 return Shuffle;
15049
15050 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15051 return Shuffle;
15052 }
15053
15054 if (PreferDUPAndInsert) {
15055 // First, build a constant vector with the common element.
15056 SmallVector<SDValue, 8> Ops(NumElts, Value);
15057 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
15058 // Next, insert the elements that do not match the common value.
15059 for (unsigned I = 0; I < NumElts; ++I)
15060 if (Op.getOperand(I) != Value)
15061 NewVector =
15062 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
15063 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
15064
15065 return NewVector;
15066 }
15067
15068 // If vector consists of two different values, try to generate two DUPs and
15069 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15070 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15071 SmallVector<SDValue, 2> Vals;
15072 // Check whether the consecutive count of each value is half the number of
15073 // vector elements. In that case, we can use CONCAT_VECTORS. For example,
15074 //
15075 // canUseVECTOR_CONCAT = true;
15076 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15077 // t24, t24, t24, t24, t24, t24, t24, t24
15078 //
15079 // canUseVECTOR_CONCAT = false;
15080 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15081 // t24, t24, t24, t24, t24, t24, t24, t24
15082 bool canUseVECTOR_CONCAT = true;
15083 for (auto Pair : DifferentValueMap) {
15084 // Check different values have same length which is NumElts / 2.
15085 if (Pair.second != NumElts / 2)
15086 canUseVECTOR_CONCAT = false;
15087 Vals.push_back(Pair.first);
15088 }
15089
15090 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15091 // CONCAT_VECTORs. For example,
15092 //
15093 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15094 // t24, t24, t24, t24, t24, t24, t24, t24
15095 // ==>
15096 // t26: v8i8 = AArch64ISD::DUP t23
15097 // t28: v8i8 = AArch64ISD::DUP t24
15098 // t29: v16i8 = concat_vectors t26, t28
15099 if (canUseVECTOR_CONCAT) {
15100 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15101 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15102 SubVT.getVectorNumElements() >= 2) {
15103 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15104 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15105 SDValue DUP1 =
15106 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
15107 SDValue DUP2 =
15108 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
15109 SDValue CONCAT_VECTORS =
15110 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
15111 return CONCAT_VECTORS;
15112 }
15113 }
15114
15115 // Let's try to generate VECTOR_SHUFFLE. For example,
15116 //
15117 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15118 // ==>
15119 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15120 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15121 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15122 if (NumElts >= 8) {
15123 SmallVector<int, 16> MaskVec;
15124 // Build the mask for VECTOR_SHUFFLE.
15125 SDValue FirstLaneVal = Op.getOperand(0);
15126 for (unsigned i = 0; i < NumElts; ++i) {
15127 SDValue Val = Op.getOperand(i);
15128 if (FirstLaneVal == Val)
15129 MaskVec.push_back(i);
15130 else
15131 MaskVec.push_back(i + NumElts);
15132 }
15133
15134 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15135 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15136 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
15137 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
15138 SDValue VECTOR_SHUFFLE =
15139 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
15140 return VECTOR_SHUFFLE;
15141 }
15142 }
15143
15144 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15145 // know the default expansion would otherwise fall back on something even
15146 // worse. For a vector with one or two non-undef values, that's
15147 // scalar_to_vector for the elements followed by a shuffle (provided the
15148 // shuffle is valid for the target) and materialization element by element
15149 // on the stack followed by a load for everything else.
15150 if (!isConstant && !usesOnlyOneValue) {
15151 LLVM_DEBUG(
15152 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15153 "of INSERT_VECTOR_ELT\n");
15154
15155 SDValue Vec = DAG.getUNDEF(VT);
15156 SDValue Op0 = Op.getOperand(0);
15157 unsigned i = 0;
15158
15159 // Use SCALAR_TO_VECTOR for lane zero to
15160 // a) Avoid a RMW dependency on the full vector register, and
15161 // b) Allow the register coalescer to fold away the copy if the
15162 // value is already in an S or D register, and we're forced to emit an
15163 // INSERT_SUBREG that we can't fold anywhere.
15164 //
15165 // We also allow types like i8 and i16 which are illegal scalar but legal
15166 // vector element types. After type-legalization the inserted value is
15167 // extended (i32) and it is safe to cast them to the vector type by ignoring
15168 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15169 if (!Op0.isUndef()) {
15170 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15171 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
15172 ++i;
15173 }
15174 LLVM_DEBUG({
15175 if (i < NumElts)
15176 dbgs() << "Creating nodes for the other vector elements:\n";
15177 });
15178 for (; i < NumElts; ++i) {
15179 SDValue V = Op.getOperand(i);
15180 if (V.isUndef())
15181 continue;
15182 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
15183 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
15184 }
15185 return Vec;
15186 }
15187
15188 LLVM_DEBUG(
15189 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15190 "better alternative\n");
15191 return SDValue();
15192}
15193
15194SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15195 SelectionDAG &DAG) const {
15196 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15197 !Subtarget->isNeonAvailable()))
15198 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15199
15200 assert(Op.getValueType().isScalableVector() &&
15201 isTypeLegal(Op.getValueType()) &&
15202 "Expected legal scalable vector type!");
15203
15204 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15205 unsigned NumOperands = Op->getNumOperands();
15206 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15207 "Unexpected number of operands in CONCAT_VECTORS");
15208
15209 if (NumOperands == 2)
15210 return Op;
15211
15212 // Concat each pair of subvectors and pack into the lower half of the array.
15213 SmallVector<SDValue> ConcatOps(Op->ops());
15214 while (ConcatOps.size() > 1) {
15215 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15216 SDValue V1 = ConcatOps[I];
15217 SDValue V2 = ConcatOps[I + 1];
15218 EVT SubVT = V1.getValueType();
15219 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15220 ConcatOps[I / 2] =
15221 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15222 }
15223 ConcatOps.resize(ConcatOps.size() / 2);
15224 }
15225 return ConcatOps[0];
15226 }
15227
15228 return SDValue();
15229}
15230
15231SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15232 SelectionDAG &DAG) const {
15233 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15234
15235 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15236 !Subtarget->isNeonAvailable()))
15237 return LowerFixedLengthInsertVectorElt(Op, DAG);
15238
15239 EVT VT = Op.getOperand(0).getValueType();
15240
15241 if (VT.getScalarType() == MVT::i1) {
15242 EVT VectorVT = getPromotedVTForPredicate(VT);
15243 SDLoc DL(Op);
15244 SDValue ExtendedVector =
15245 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15246 SDValue ExtendedValue =
15247 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15248 VectorVT.getScalarType().getSizeInBits() < 32
15249 ? MVT::i32
15250 : VectorVT.getScalarType());
15251 ExtendedVector =
15252 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15253 ExtendedValue, Op.getOperand(2));
15254 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15255 }
15256
15257 // Check for non-constant or out of range lane.
15258 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15259 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15260 return SDValue();
15261
15262 return Op;
15263}
15264
15265SDValue
15266AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15267 SelectionDAG &DAG) const {
15268 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15269 EVT VT = Op.getOperand(0).getValueType();
15270
15271 if (VT.getScalarType() == MVT::i1) {
15272 // We can't directly extract from an SVE predicate; extend it first.
15273 // (This isn't the only possible lowering, but it's straightforward.)
15274 EVT VectorVT = getPromotedVTForPredicate(VT);
15275 SDLoc DL(Op);
15276 SDValue Extend =
15277 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15278 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15279 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15280 Extend, Op.getOperand(1));
15281 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15282 }
15283
15284 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15285 return LowerFixedLengthExtractVectorElt(Op, DAG);
15286
15287 // Check for non-constant or out of range lane.
15288 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15289 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15290 return SDValue();
15291
15292 // Insertion/extraction are legal for V128 types.
15293 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15294 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15295 VT == MVT::v8f16 || VT == MVT::v8bf16)
15296 return Op;
15297
15298 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15299 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15300 VT != MVT::v4bf16)
15301 return SDValue();
15302
15303 // For V64 types, we perform extraction by expanding the value
15304 // to a V128 type and performing the extraction on that.
15305 SDLoc DL(Op);
15306 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15307 EVT WideTy = WideVec.getValueType();
15308
15309 EVT ExtrTy = WideTy.getVectorElementType();
15310 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15311 ExtrTy = MVT::i32;
15312
15313 // For extractions, we just return the result directly.
15314 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15315 Op.getOperand(1));
15316}
15317
15318SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15319 SelectionDAG &DAG) const {
15320 EVT VT = Op.getValueType();
15322 "Only cases that extract a fixed length vector are supported!");
15323 EVT InVT = Op.getOperand(0).getValueType();
15324
15325 // If we don't have legal types yet, do nothing
15326 if (!isTypeLegal(InVT))
15327 return SDValue();
15328
15329 if (InVT.is128BitVector()) {
15330 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15331 unsigned Idx = Op.getConstantOperandVal(1);
15332
15333 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15334 if (Idx == 0)
15335 return Op;
15336
15337 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15338 // that directly.
15339 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15340 return Op;
15341 }
15342
15343 if (InVT.isScalableVector() ||
15344 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
15345 SDLoc DL(Op);
15346 SDValue Vec = Op.getOperand(0);
15347 SDValue Idx = Op.getOperand(1);
15348
15349 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
15350 if (PackedVT != InVT) {
15351 // Pack input into the bottom part of an SVE register and try again.
15352 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
15353 DAG.getUNDEF(PackedVT), Vec,
15354 DAG.getVectorIdxConstant(0, DL));
15355 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
15356 }
15357
15358 // This will get matched by custom code during ISelDAGToDAG.
15359 if (isNullConstant(Idx))
15360 return Op;
15361
15362 assert(InVT.isScalableVector() && "Unexpected vector type!");
15363 // Move requested subvector to the start of the vector and try again.
15364 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
15365 return convertFromScalableVector(DAG, VT, Splice);
15366 }
15367
15368 return SDValue();
15369}
15370
15371SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15372 SelectionDAG &DAG) const {
15373 assert(Op.getValueType().isScalableVector() &&
15374 "Only expect to lower inserts into scalable vectors!");
15375
15376 EVT InVT = Op.getOperand(1).getValueType();
15377 unsigned Idx = Op.getConstantOperandVal(2);
15378
15379 SDValue Vec0 = Op.getOperand(0);
15380 SDValue Vec1 = Op.getOperand(1);
15381 SDLoc DL(Op);
15382 EVT VT = Op.getValueType();
15383
15384 if (InVT.isScalableVector()) {
15385 if (!isTypeLegal(VT))
15386 return SDValue();
15387
15388 // Break down insert_subvector into simpler parts.
15389 if (VT.getVectorElementType() == MVT::i1) {
15390 unsigned NumElts = VT.getVectorMinNumElements();
15391 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15392
15393 SDValue Lo, Hi;
15394 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15395 DAG.getVectorIdxConstant(0, DL));
15396 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15397 DAG.getVectorIdxConstant(NumElts / 2, DL));
15398 if (Idx < (NumElts / 2))
15399 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
15400 DAG.getVectorIdxConstant(Idx, DL));
15401 else
15402 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
15403 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
15404
15405 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15406 }
15407
15408 // We can select these directly.
15409 if (isTypeLegal(InVT) && Vec0.isUndef())
15410 return Op;
15411
15412 // Ensure the subvector is half the size of the main vector.
15413 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15414 return SDValue();
15415
15416 // Here narrow and wide refer to the vector element types. After "casting",
15417 // both vectors must have the same bit length and so, because the subvector
15418 // has fewer elements, those elements need to be bigger.
15419 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
15420 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
15421
15422 // NOP cast operands to the largest legal vector of the same element count.
15423 if (VT.isFloatingPoint()) {
15424 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
15425 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
15426 } else {
15427 // Legal integer vectors are already their largest so Vec0 is fine as is.
15428 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
15429 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
15430 }
15431
15432 // To replace the top/bottom half of vector V with vector SubV we widen the
15433 // preserved half of V, concatenate this to SubV (the order depending on the
15434 // half being replaced) and then narrow the result.
15435 SDValue Narrow;
15436 if (Idx == 0) {
15437 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
15438 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
15439 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
15440 } else {
15442 "Invalid subvector index!");
15443 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
15444 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
15445 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
15446 }
15447
15448 return getSVESafeBitCast(VT, Narrow, DAG);
15449 }
15450
15451 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15452 // This will be matched by custom code during ISelDAGToDAG.
15453 if (Vec0.isUndef())
15454 return Op;
15455
15456 std::optional<unsigned> PredPattern =
15457 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
15458 auto PredTy = VT.changeVectorElementType(MVT::i1);
15459 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
15460 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
15461 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
15462 }
15463
15464 return SDValue();
15465}
15466
15467static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15468 if (Op.getOpcode() != AArch64ISD::DUP &&
15469 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15470 Op.getOpcode() != ISD::BUILD_VECTOR)
15471 return false;
15472
15473 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15474 !isAllConstantBuildVector(Op, SplatVal))
15475 return false;
15476
15477 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15478 !isa<ConstantSDNode>(Op->getOperand(0)))
15479 return false;
15480
15481 SplatVal = Op->getConstantOperandVal(0);
15482 if (Op.getValueType().getVectorElementType() != MVT::i64)
15483 SplatVal = (int32_t)SplatVal;
15484
15485 Negated = false;
15486 if (isPowerOf2_64(SplatVal))
15487 return true;
15488
15489 Negated = true;
15490 if (isPowerOf2_64(-SplatVal)) {
15491 SplatVal = -SplatVal;
15492 return true;
15493 }
15494
15495 return false;
15496}
15497
15498SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15499 EVT VT = Op.getValueType();
15500 SDLoc dl(Op);
15501
15502 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15503 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15504
15505 assert(VT.isScalableVector() && "Expected a scalable vector.");
15506
15507 bool Signed = Op.getOpcode() == ISD::SDIV;
15508 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15509
15510 bool Negated;
15511 uint64_t SplatVal;
15512 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
15513 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
15514 SDValue Res =
15515 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
15516 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
15517 if (Negated)
15518 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
15519
15520 return Res;
15521 }
15522
15523 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15524 return LowerToPredicatedOp(Op, DAG, PredOpcode);
15525
15526 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15527 // operations, and truncate the result.
15528 EVT WidenedVT;
15529 if (VT == MVT::nxv16i8)
15530 WidenedVT = MVT::nxv8i16;
15531 else if (VT == MVT::nxv8i16)
15532 WidenedVT = MVT::nxv4i32;
15533 else
15534 llvm_unreachable("Unexpected Custom DIV operation");
15535
15536 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15537 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15538 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
15539 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
15540 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
15541 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
15542 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
15543 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
15544 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultLo);
15545 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultHi);
15546 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLoCast, ResultHiCast);
15547}
15548
15549bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15550 EVT VT, unsigned DefinedValues) const {
15551 if (!Subtarget->isNeonAvailable())
15552 return false;
15553 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
15554}
15555
15556 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15557 // Currently no fixed length shuffles that require SVE are legal.
15558 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15559 return false;
15560
15561 if (VT.getVectorNumElements() == 4 &&
15562 (VT.is128BitVector() || VT.is64BitVector())) {
15563 unsigned Cost = getPerfectShuffleCost(M);
15564 if (Cost <= 1)
15565 return true;
15566 }
15567
15568 bool DummyBool;
15569 int DummyInt;
15570 unsigned DummyUnsigned;
15571
15572 unsigned EltSize = VT.getScalarSizeInBits();
15573 unsigned NumElts = VT.getVectorNumElements();
15574 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
15575 isREVMask(M, EltSize, NumElts, 64) ||
15576 isREVMask(M, EltSize, NumElts, 32) ||
15577 isREVMask(M, EltSize, NumElts, 16) ||
15578 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
15579 isTRNMask(M, NumElts, DummyUnsigned) ||
15580 isUZPMask(M, NumElts, DummyUnsigned) ||
15581 isZIPMask(M, NumElts, DummyUnsigned) ||
15582 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
15583 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
15584 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
15585 isINSMask(M, NumElts, DummyBool, DummyInt) ||
15586 isConcatMask(M, VT, VT.getSizeInBits() == 128));
15587}
15588
15589 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15590 EVT VT) const {
15591 // Just delegate to the generic legality, clear masks aren't special.
15592 return isShuffleMaskLegal(M, VT);
15593}
15594
15595/// getVShiftImm - Check if this is a valid build_vector for the immediate
15596/// operand of a vector shift operation, where all the elements of the
15597/// build_vector must have the same constant integer value.
15598static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15599 // Ignore bit_converts.
15600 while (Op.getOpcode() == ISD::BITCAST)
15601 Op = Op.getOperand(0);
15602 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
15603 APInt SplatBits, SplatUndef;
15604 unsigned SplatBitSize;
15605 bool HasAnyUndefs;
15606 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
15607 HasAnyUndefs, ElementBits) ||
15608 SplatBitSize > ElementBits)
15609 return false;
15610 Cnt = SplatBits.getSExtValue();
15611 return true;
15612}
15613
15614/// isVShiftLImm - Check if this is a valid build_vector for the immediate
15615/// operand of a vector shift left operation. That value must be in the range:
15616/// 0 <= Value < ElementBits for a left shift; or
15617/// 0 <= Value <= ElementBits for a long left shift.
15618static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
15619 assert(VT.isVector() && "vector shift count is not a vector type");
15620 int64_t ElementBits = VT.getScalarSizeInBits();
15621 if (!getVShiftImm(Op, ElementBits, Cnt))
15622 return false;
15623 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
15624}
15625
15626/// isVShiftRImm - Check if this is a valid build_vector for the immediate
15627/// operand of a vector shift right operation. The value must be in the range:
15628 /// 1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
15629static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
15630 assert(VT.isVector() && "vector shift count is not a vector type");
15631 int64_t ElementBits = VT.getScalarSizeInBits();
15632 if (!getVShiftImm(Op, ElementBits, Cnt))
15633 return false;
15634 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
15635}
15636
15637SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
15638 SelectionDAG &DAG) const {
15639 EVT VT = Op.getValueType();
15640
15641 if (VT.getScalarType() == MVT::i1) {
15642 // Lower i1 truncate to `(x & 1) != 0`.
15643 SDLoc dl(Op);
15644 EVT OpVT = Op.getOperand(0).getValueType();
15645 SDValue Zero = DAG.getConstant(0, dl, OpVT);
15646 SDValue One = DAG.getConstant(1, dl, OpVT);
15647 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
15648 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
15649 }
15650
15651 if (!VT.isVector() || VT.isScalableVector())
15652 return SDValue();
15653
15654 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15655 !Subtarget->isNeonAvailable()))
15656 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
15657
15658 return SDValue();
15659}
15660
15661 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
15662 // possibly a truncated type; it tells how many bits of the value are to be
15663// used.
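// For example, with ShiftValue == 4 the pattern
//   (srl (add X, 8), 4)
// computes (X + 8) >> 4, i.e. X shifted right by four with rounding, which
// matches a single URSHR instruction.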
15664 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
15665 SelectionDAG &DAG,
15666 unsigned &ShiftValue,
15667 SDValue &RShOperand) {
15668 if (Shift->getOpcode() != ISD::SRL)
15669 return false;
15670
15671 EVT VT = Shift.getValueType();
15672 assert(VT.isScalableVT());
15673
15674 auto ShiftOp1 =
15675 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
15676 if (!ShiftOp1)
15677 return false;
15678
15679 ShiftValue = ShiftOp1->getZExtValue();
15680 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
15681 return false;
15682
15683 SDValue Add = Shift->getOperand(0);
15684 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
15685 return false;
15686
15688 "ResVT must be truncated or same type as the shift.");
15689 // Check if an overflow can lead to incorrect results.
15690 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
15691 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
15692 return false;
15693
15694 auto AddOp1 =
15695 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
15696 if (!AddOp1)
15697 return false;
15698 uint64_t AddValue = AddOp1->getZExtValue();
15699 if (AddValue != 1ULL << (ShiftValue - 1))
15700 return false;
15701
15702 RShOperand = Add->getOperand(0);
15703 return true;
15704}
15705
15706SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
15707 SelectionDAG &DAG) const {
15708 EVT VT = Op.getValueType();
15709 SDLoc DL(Op);
15710 int64_t Cnt;
15711
15712 if (!Op.getOperand(1).getValueType().isVector())
15713 return Op;
15714 unsigned EltSize = VT.getScalarSizeInBits();
15715
15716 switch (Op.getOpcode()) {
15717 case ISD::SHL:
15718 if (VT.isScalableVector() ||
15719 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15720 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
15721
15722 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
15723 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
15724 DAG.getConstant(Cnt, DL, MVT::i32));
15725 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15726 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
15727 MVT::i32),
15728 Op.getOperand(0), Op.getOperand(1));
15729 case ISD::SRA:
15730 case ISD::SRL:
15731 if (VT.isScalableVector() &&
15732 (Subtarget->hasSVE2() ||
15733 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
15734 SDValue RShOperand;
15735 unsigned ShiftValue;
15736 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
15737 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
15738 getPredicateForVector(DAG, DL, VT), RShOperand,
15739 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
15740 }
15741
15742 if (VT.isScalableVector() ||
15743 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
15744 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
15745 : AArch64ISD::SRL_PRED;
15746 return LowerToPredicatedOp(Op, DAG, Opc);
15747 }
15748
15749 // Right shift immediate
15750 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
15751 unsigned Opc =
15752 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
15753 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
15754 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
15755 }
15756
15757 // Right shift register. Note, there is not a shift right register
15758 // instruction, but the shift left register instruction takes a signed
15759 // value, where negative numbers specify a right shift.
15760 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
15761 : Intrinsic::aarch64_neon_ushl;
15762 // negate the shift amount
15763 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
15764 Op.getOperand(1));
15765 SDValue NegShiftLeft =
15766 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15767 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
15768 NegShift);
15769 return NegShiftLeft;
15770 }
15771
15772 llvm_unreachable("unexpected shift opcode");
15773}
15774
15775 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
15776 AArch64CC::CondCode CC, bool NoNans, EVT VT,
15777 const SDLoc &dl, SelectionDAG &DAG) {
15778 EVT SrcVT = LHS.getValueType();
15779 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
15780 "function only supposed to emit natural comparisons");
15781
15782 APInt SplatValue;
15783 APInt SplatUndef;
15784 unsigned SplatBitSize = 0;
15785 bool HasAnyUndefs;
15786
15787 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15788 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
15789 SplatBitSize, HasAnyUndefs);
15790
15791 bool IsZero = IsCnst && SplatValue == 0;
15792 bool IsOne =
15793 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
15794 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
15795
15796 if (SrcVT.getVectorElementType().isFloatingPoint()) {
15797 switch (CC) {
15798 default:
15799 return SDValue();
15800 case AArch64CC::NE: {
15801 SDValue Fcmeq;
15802 if (IsZero)
15803 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15804 else
15805 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15806 return DAG.getNOT(dl, Fcmeq, VT);
15807 }
15808 case AArch64CC::EQ:
15809 if (IsZero)
15810 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15811 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15812 case AArch64CC::GE:
15813 if (IsZero)
15814 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
15815 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
15816 case AArch64CC::GT:
15817 if (IsZero)
15818 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
15819 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15820 case AArch64CC::LE:
15821 if (!NoNans)
15822 return SDValue();
15823 // If we ignore NaNs then we can use the LS implementation.
15824 [[fallthrough]];
15825 case AArch64CC::LS:
15826 if (IsZero)
15827 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
15828 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15829 case AArch64CC::LT:
15830 if (!NoNans)
15831 return SDValue();
15832 // If we ignore NaNs then we can use the MI implementation.
15833 [[fallthrough]];
15834 case AArch64CC::MI:
15835 if (IsZero)
15836 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
15837 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15838 }
15839 }
15840
15841 switch (CC) {
15842 default:
15843 return SDValue();
15844 case AArch64CC::NE: {
15845 SDValue Cmeq;
15846 if (IsZero)
15847 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15848 else
15849 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15850 return DAG.getNOT(dl, Cmeq, VT);
15851 }
15852 case AArch64CC::EQ:
15853 if (IsZero)
15854 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15855 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15856 case AArch64CC::GE:
15857 if (IsZero)
15858 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15859 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
15860 case AArch64CC::GT:
15861 if (IsZero)
15862 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
15863 if (IsMinusOne)
15864 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15865 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
15866 case AArch64CC::LE:
15867 if (IsZero)
15868 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15869 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
15870 case AArch64CC::LS:
15871 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
15872 case AArch64CC::LO:
15873 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
15874 case AArch64CC::LT:
15875 if (IsZero)
15876 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
15877 if (IsOne)
15878 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15879 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
15880 case AArch64CC::HI:
15881 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
15882 case AArch64CC::HS:
15883 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
15884 }
15885}
15886
15887SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15888 SelectionDAG &DAG) const {
15889 if (Op.getValueType().isScalableVector())
15890 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
15891
15892 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15893 !Subtarget->isNeonAvailable()))
15894 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15895
15896 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15897 SDValue LHS = Op.getOperand(0);
15898 SDValue RHS = Op.getOperand(1);
15899 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15900 SDLoc dl(Op);
15901
15902 if (LHS.getValueType().getVectorElementType().isInteger()) {
15903 assert(LHS.getValueType() == RHS.getValueType());
15904 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
15905 SDValue Cmp =
15906 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
15907 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15908 }
15909
15910 // Lower isnan(x) | isnan(never-nan) to x != x.
15911 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15912 if (CC == ISD::SETUO || CC == ISD::SETO) {
15913 bool OneNaN = false;
15914 if (LHS == RHS) {
15915 OneNaN = true;
15916 } else if (DAG.isKnownNeverNaN(RHS)) {
15917 OneNaN = true;
15918 RHS = LHS;
15919 } else if (DAG.isKnownNeverNaN(LHS)) {
15920 OneNaN = true;
15921 LHS = RHS;
15922 }
15923 if (OneNaN) {
15924 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15925 }
15926 }
15927
15928 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15929
15930 // Make v4f16 (only) fcmp operations utilise vector instructions
15931 // v8f16 support will be a little more complicated
15932 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
15933 LHS.getValueType().getVectorElementType() == MVT::bf16) {
15934 if (LHS.getValueType().getVectorNumElements() == 4) {
15935 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
15936 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
15937 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
15938 DAG.ReplaceAllUsesWith(Op, NewSetcc);
15939 CmpVT = MVT::v4i32;
15940 } else
15941 return SDValue();
15942 }
15943
15944 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
15945 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15946 LHS.getValueType().getVectorElementType() != MVT::f128);
15947
15948 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15949 // clean. Some of them require two branches to implement.
15950 AArch64CC::CondCode CC1, CC2;
15951 bool ShouldInvert;
15952 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
15953
15954 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15955 SDValue Cmp =
15956 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
15957 if (!Cmp.getNode())
15958 return SDValue();
15959
15960 if (CC2 != AArch64CC::AL) {
15961 SDValue Cmp2 =
15962 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
15963 if (!Cmp2.getNode())
15964 return SDValue();
15965
15966 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
15967 }
15968
15969 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15970
15971 if (ShouldInvert)
15972 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
15973
15974 return Cmp;
15975}
15976
15977static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15978 SelectionDAG &DAG) {
15979 SDValue VecOp = ScalarOp.getOperand(0);
15980 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
15981 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
15982 DAG.getConstant(0, DL, MVT::i64));
15983}
15984
15985static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15986 SDLoc DL, SelectionDAG &DAG) {
15987 unsigned ScalarOpcode;
15988 switch (Opcode) {
15989 case ISD::VECREDUCE_AND:
15990 ScalarOpcode = ISD::AND;
15991 break;
15992 case ISD::VECREDUCE_OR:
15993 ScalarOpcode = ISD::OR;
15994 break;
15995 case ISD::VECREDUCE_XOR:
15996 ScalarOpcode = ISD::XOR;
15997 break;
15998 default:
15999 llvm_unreachable("Expected bitwise vector reduction");
16000 return SDValue();
16001 }
16002
16003 EVT VecVT = Vec.getValueType();
16004 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16005 "Expected power-of-2 length vector");
16006
16007 EVT ElemVT = VecVT.getVectorElementType();
16008
16009 SDValue Result;
16010 unsigned NumElems = VecVT.getVectorNumElements();
16011
16012 // Special case for boolean reductions
16013 if (ElemVT == MVT::i1) {
16014 // Split large vectors into smaller ones
16015 if (NumElems > 16) {
16016 SDValue Lo, Hi;
16017 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16018 EVT HalfVT = Lo.getValueType();
16019 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16020 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16021 }
16022
16023 // Results of setcc operations get widened to 128 bits if their input
16024 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16025 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16026 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16027 // size leads to the best codegen, since e.g. setcc results might need to be
16028 // truncated otherwise.
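// For example, an AND reduction of v4i1 coming from a 128-bit wide setcc is
// sign extended to v4i32 (each lane becomes 0 or -1) and lowered as a UMINV:
// the result is all-ones only if every input lane was true.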
16029 unsigned ExtendedWidth = 64;
16030 if (Vec.getOpcode() == ISD::SETCC &&
16031 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16032 ExtendedWidth = 128;
16033 }
16034 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16035
16036 // any_ext doesn't work with umin/umax, so only use it for uadd.
16037 unsigned ExtendOp =
16038 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16039 SDValue Extended = DAG.getNode(
16040 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16041 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16042 // in that case we bitcast the sign extended values from v2i64 to v4i32
16043 // before reduction for optimal code generation.
16044 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16045 NumElems == 2 && ExtendedWidth == 128) {
16046 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16047 ExtendedVT = MVT::i32;
16048 }
16049 switch (ScalarOpcode) {
16050 case ISD::AND:
16051 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16052 break;
16053 case ISD::OR:
16054 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16055 break;
16056 case ISD::XOR:
16057 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16058 break;
16059 default:
16060 llvm_unreachable("Unexpected Opcode");
16061 }
16062
16063 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16064 } else {
16065 // Iteratively split the vector in half and combine using the bitwise
16066 // operation until it fits in a 64 bit register.
16067 while (VecVT.getSizeInBits() > 64) {
16068 SDValue Lo, Hi;
16069 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16070 VecVT = Lo.getValueType();
16071 NumElems = VecVT.getVectorNumElements();
16072 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16073 }
16074
16075 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16076
16077 // Do the remaining work on a scalar since it allows the code generator to
16078 // combine the shift and bitwise operation into one instruction and since
16079 // integer instructions can have higher throughput than vector instructions.
16080 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16081
16082 // Iteratively combine the lower and upper halves of the scalar using the
16083 // bitwise operation, halving the relevant region of the scalar in each
16084 // iteration, until the relevant region is just one element of the original
16085 // vector.
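// For example, for a v8i8 XOR reduction the 64-bit scalar is XORed with
// itself shifted right by 32, then 16, then 8 bits, leaving the reduced
// value in the lowest byte.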
16086 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16087 SDValue ShiftAmount =
16088 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16089 SDValue Shifted =
16090 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16091 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16092 }
16093
16094 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16095 }
16096
16097 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16098}
16099
16100SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16101 SelectionDAG &DAG) const {
16102 SDValue Src = Op.getOperand(0);
16103
16104 // Try to lower fixed length reductions to SVE.
16105 EVT SrcVT = Src.getValueType();
16106 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16107 Op.getOpcode() == ISD::VECREDUCE_AND ||
16108 Op.getOpcode() == ISD::VECREDUCE_OR ||
16109 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16110 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16111 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16112 SrcVT.getVectorElementType() == MVT::i64);
16113 if (SrcVT.isScalableVector() ||
16114 useSVEForFixedLengthVectorVT(
16115 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16116
16117 if (SrcVT.getVectorElementType() == MVT::i1)
16118 return LowerPredReductionToSVE(Op, DAG);
16119
16120 switch (Op.getOpcode()) {
16121 case ISD::VECREDUCE_ADD:
16122 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16123 case ISD::VECREDUCE_AND:
16124 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16125 case ISD::VECREDUCE_OR:
16126 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16127 case ISD::VECREDUCE_SMAX:
16128 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16129 case ISD::VECREDUCE_SMIN:
16130 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16131 case ISD::VECREDUCE_UMAX:
16132 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16133 case ISD::VECREDUCE_UMIN:
16134 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16135 case ISD::VECREDUCE_XOR:
16136 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16137 case ISD::VECREDUCE_FADD:
16138 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16139 case ISD::VECREDUCE_FMAX:
16140 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16141 case ISD::VECREDUCE_FMIN:
16142 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16143 case ISD::VECREDUCE_FMAXIMUM:
16144 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16145 case ISD::VECREDUCE_FMINIMUM:
16146 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16147 default:
16148 llvm_unreachable("Unhandled fixed length reduction");
16149 }
16150 }
16151
16152 // Lower NEON reductions.
16153 SDLoc dl(Op);
16154 switch (Op.getOpcode()) {
16155 case ISD::VECREDUCE_AND:
16156 case ISD::VECREDUCE_OR:
16157 case ISD::VECREDUCE_XOR:
16158 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16159 Op.getValueType(), dl, DAG);
16160 case ISD::VECREDUCE_ADD:
16161 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
16162 case ISD::VECREDUCE_SMAX:
16163 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
16164 case ISD::VECREDUCE_SMIN:
16165 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
16166 case ISD::VECREDUCE_UMAX:
16167 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
16168 case ISD::VECREDUCE_UMIN:
16169 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
16170 default:
16171 llvm_unreachable("Unhandled reduction");
16172 }
16173}
16174
16175SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16176 SelectionDAG &DAG) const {
16177 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16178 // No point replacing if we don't have the relevant instruction/libcall anyway
16179 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16180 return SDValue();
16181
16182 // LSE has an atomic load-clear instruction, but not a load-and.
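// For example, "atomicrmw and %ptr, %mask" becomes an ATOMIC_LOAD_CLR with
// ~%mask below, since LDCLR computes Mem &= ~Operand.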
16183 SDLoc dl(Op);
16184 MVT VT = Op.getSimpleValueType();
16185 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16186 SDValue RHS = Op.getOperand(2);
16187 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16188 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getAllOnesConstant(dl, VT), RHS);
16189 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
16190 Op.getOperand(0), Op.getOperand(1), RHS,
16191 AN->getMemOperand());
16192}
16193
16194SDValue
16195AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16196 SelectionDAG &DAG) const {
16197
16198 SDLoc dl(Op);
16199 // Get the inputs.
16200 SDNode *Node = Op.getNode();
16201 SDValue Chain = Op.getOperand(0);
16202 SDValue Size = Op.getOperand(1);
16203 MaybeAlign Align =
16204 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16205 EVT VT = Node->getValueType(0);
16206
16208 "no-stack-arg-probe")) {
16209 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16210 Chain = SP.getValue(1);
16211 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16212 if (Align)
16213 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16214 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16215 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16216 SDValue Ops[2] = {SP, Chain};
16217 return DAG.getMergeValues(Ops, dl);
16218 }
16219
16220 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
16221
16222 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16223 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16224 PtrVT, 0);
16225
16226 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16227 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16228 if (Subtarget->hasCustomCallingConv())
16229 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16230
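// The Windows stack probe helper expects the allocation size in units of 16
// bytes in X15, so the size is divided by 16 here and scaled back up after
// the call.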
16231 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
16232 DAG.getConstant(4, dl, MVT::i64));
16233 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
16234 Chain =
16235 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
16236 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16237 DAG.getRegisterMask(Mask), Chain.getValue(1));
16238 // To match the actual intent better, we should read the output from X15 here
16239 // again (instead of potentially spilling it to the stack), but rereading Size
16240 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16241 // here.
16242
16243 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
16244 DAG.getConstant(4, dl, MVT::i64));
16245
16246 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16247 Chain = SP.getValue(1);
16248 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16249 if (Align)
16250 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16251 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16252 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16253
16254 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
16255
16256 SDValue Ops[2] = {SP, Chain};
16257 return DAG.getMergeValues(Ops, dl);
16258}
16259
16260SDValue
16261AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16262 SelectionDAG &DAG) const {
16263 // Get the inputs.
16264 SDNode *Node = Op.getNode();
16265 SDValue Chain = Op.getOperand(0);
16266 SDValue Size = Op.getOperand(1);
16267
16268 MaybeAlign Align =
16269 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16270 SDLoc dl(Op);
16271 EVT VT = Node->getValueType(0);
16272
16273 // Construct the new SP value in a GPR.
16274 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16275 Chain = SP.getValue(1);
16276 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16277 if (Align)
16278 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16279 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16280
16281 // Set the real SP to the new value with a probing loop.
16282 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
16283 SDValue Ops[2] = {SP, Chain};
16284 return DAG.getMergeValues(Ops, dl);
16285}
16286
16287SDValue
16288AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16289 SelectionDAG &DAG) const {
16290 MachineFunction &MF = DAG.getMachineFunction();
16291
16292 if (Subtarget->isTargetWindows())
16293 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16294 else if (hasInlineStackProbe(MF))
16295 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16296 else
16297 return SDValue();
16298}
16299
16300SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16301 unsigned NewOp) const {
16302 if (Subtarget->hasSVE2())
16303 return LowerToPredicatedOp(Op, DAG, NewOp);
16304
16305 // Default to expand.
16306 return SDValue();
16307}
16308
16309SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16310 SelectionDAG &DAG) const {
16311 EVT VT = Op.getValueType();
16312 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16313
16314 SDLoc DL(Op);
16315 APInt MulImm = Op.getConstantOperandAPInt(0);
16316 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16317 VT);
16318}
16319
16320/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16321template <unsigned NumVecs>
16322static bool
16323 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16324 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16325 Info.opc = ISD::INTRINSIC_VOID;
16326 // Retrieve EC from first vector argument.
16327 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16328 ElementCount EC = VT.getVectorElementCount();
16329#ifndef NDEBUG
16330 // Check the assumption that all input vectors are the same type.
16331 for (unsigned I = 0; I < NumVecs; ++I)
16332 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16333 "Invalid type.");
16334#endif
16335 // memVT is `NumVecs * VT`.
16336 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
16337 EC * NumVecs);
16338 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16339 Info.offset = 0;
16340 Info.align.reset();
16341 Info.flags = MachineMemOperand::MOStore;
16342 return true;
16343}
16344
16345/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16346/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16347/// specified in the intrinsic calls.
16348 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16349 const CallInst &I,
16350 MachineFunction &MF,
16351 unsigned Intrinsic) const {
16352 auto &DL = I.getDataLayout();
16353 switch (Intrinsic) {
16354 case Intrinsic::aarch64_sve_st2:
16355 return setInfoSVEStN<2>(*this, DL, Info, I);
16356 case Intrinsic::aarch64_sve_st3:
16357 return setInfoSVEStN<3>(*this, DL, Info, I);
16358 case Intrinsic::aarch64_sve_st4:
16359 return setInfoSVEStN<4>(*this, DL, Info, I);
16360 case Intrinsic::aarch64_neon_ld2:
16361 case Intrinsic::aarch64_neon_ld3:
16362 case Intrinsic::aarch64_neon_ld4:
16363 case Intrinsic::aarch64_neon_ld1x2:
16364 case Intrinsic::aarch64_neon_ld1x3:
16365 case Intrinsic::aarch64_neon_ld1x4: {
16366 Info.opc = ISD::INTRINSIC_W_CHAIN;
16367 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16368 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16369 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16370 Info.offset = 0;
16371 Info.align.reset();
16372 // volatile loads with NEON intrinsics not supported
16373 Info.flags = MachineMemOperand::MOLoad;
16374 return true;
16375 }
16376 case Intrinsic::aarch64_neon_ld2lane:
16377 case Intrinsic::aarch64_neon_ld3lane:
16378 case Intrinsic::aarch64_neon_ld4lane:
16379 case Intrinsic::aarch64_neon_ld2r:
16380 case Intrinsic::aarch64_neon_ld3r:
16381 case Intrinsic::aarch64_neon_ld4r: {
16382 Info.opc = ISD::INTRINSIC_W_CHAIN;
16383 // These intrinsics return a struct whose members all have the same vector type.
16384 Type *RetTy = I.getType();
16385 auto *StructTy = cast<StructType>(RetTy);
16386 unsigned NumElts = StructTy->getNumElements();
16387 Type *VecTy = StructTy->getElementType(0);
16388 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16389 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16390 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16391 Info.offset = 0;
16392 Info.align.reset();
16393 // volatile loads with NEON intrinsics not supported
16394 Info.flags = MachineMemOperand::MOLoad;
16395 return true;
16396 }
16397 case Intrinsic::aarch64_neon_st2:
16398 case Intrinsic::aarch64_neon_st3:
16399 case Intrinsic::aarch64_neon_st4:
16400 case Intrinsic::aarch64_neon_st1x2:
16401 case Intrinsic::aarch64_neon_st1x3:
16402 case Intrinsic::aarch64_neon_st1x4: {
16403 Info.opc = ISD::INTRINSIC_VOID;
16404 unsigned NumElts = 0;
16405 for (const Value *Arg : I.args()) {
16406 Type *ArgTy = Arg->getType();
16407 if (!ArgTy->isVectorTy())
16408 break;
16409 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16410 }
16411 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16412 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16413 Info.offset = 0;
16414 Info.align.reset();
16415 // volatile stores with NEON intrinsics not supported
16416 Info.flags = MachineMemOperand::MOStore;
16417 return true;
16418 }
16419 case Intrinsic::aarch64_neon_st2lane:
16420 case Intrinsic::aarch64_neon_st3lane:
16421 case Intrinsic::aarch64_neon_st4lane: {
16422 Info.opc = ISD::INTRINSIC_VOID;
16423 unsigned NumElts = 0;
16424 // All the vector operands have the same type.
16425 Type *VecTy = I.getArgOperand(0)->getType();
16426 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16427
16428 for (const Value *Arg : I.args()) {
16429 Type *ArgTy = Arg->getType();
16430 if (!ArgTy->isVectorTy())
16431 break;
16432 NumElts += 1;
16433 }
16434
16435 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16436 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16437 Info.offset = 0;
16438 Info.align.reset();
16439 // volatile stores with NEON intrinsics not supported
16440 Info.flags = MachineMemOperand::MOStore;
16441 return true;
16442 }
16443 case Intrinsic::aarch64_ldaxr:
16444 case Intrinsic::aarch64_ldxr: {
16445 Type *ValTy = I.getParamElementType(0);
16446 Info.opc = ISD::INTRINSIC_W_CHAIN;
16447 Info.memVT = MVT::getVT(ValTy);
16448 Info.ptrVal = I.getArgOperand(0);
16449 Info.offset = 0;
16450 Info.align = DL.getABITypeAlign(ValTy);
16451 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16452 return true;
16453 }
16454 case Intrinsic::aarch64_stlxr:
16455 case Intrinsic::aarch64_stxr: {
16456 Type *ValTy = I.getParamElementType(1);
16457 Info.opc = ISD::INTRINSIC_W_CHAIN;
16458 Info.memVT = MVT::getVT(ValTy);
16459 Info.ptrVal = I.getArgOperand(1);
16460 Info.offset = 0;
16461 Info.align = DL.getABITypeAlign(ValTy);
16462 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16463 return true;
16464 }
16465 case Intrinsic::aarch64_ldaxp:
16466 case Intrinsic::aarch64_ldxp:
16467 Info.opc = ISD::INTRINSIC_W_CHAIN;
16468 Info.memVT = MVT::i128;
16469 Info.ptrVal = I.getArgOperand(0);
16470 Info.offset = 0;
16471 Info.align = Align(16);
16472 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16473 return true;
16474 case Intrinsic::aarch64_stlxp:
16475 case Intrinsic::aarch64_stxp:
16476 Info.opc = ISD::INTRINSIC_W_CHAIN;
16477 Info.memVT = MVT::i128;
16478 Info.ptrVal = I.getArgOperand(2);
16479 Info.offset = 0;
16480 Info.align = Align(16);
16481 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16482 return true;
16483 case Intrinsic::aarch64_sve_ldnt1: {
16484 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
16485 Info.opc = ISD::INTRINSIC_W_CHAIN;
16486 Info.memVT = MVT::getVT(I.getType());
16487 Info.ptrVal = I.getArgOperand(1);
16488 Info.offset = 0;
16489 Info.align = DL.getABITypeAlign(ElTy);
16490 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16491 return true;
16492 }
16493 case Intrinsic::aarch64_sve_stnt1: {
16494 Type *ElTy =
16495 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
16496 Info.opc = ISD::INTRINSIC_W_CHAIN;
16497 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
16498 Info.ptrVal = I.getArgOperand(2);
16499 Info.offset = 0;
16500 Info.align = DL.getABITypeAlign(ElTy);
16501 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16502 return true;
16503 }
16504 case Intrinsic::aarch64_mops_memset_tag: {
16505 Value *Dst = I.getArgOperand(0);
16506 Value *Val = I.getArgOperand(1);
16507 Info.opc = ISD::INTRINSIC_W_CHAIN;
16508 Info.memVT = MVT::getVT(Val->getType());
16509 Info.ptrVal = Dst;
16510 Info.offset = 0;
16511 Info.align = I.getParamAlign(0).valueOrOne();
16512 Info.flags = MachineMemOperand::MOStore;
16513 // The size of the memory being operated on is unknown at this point
16514 Info.size = MemoryLocation::UnknownSize;
16515 return true;
16516 }
16517 default:
16518 break;
16519 }
16520
16521 return false;
16522}
16523
16524 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
16525 ISD::LoadExtType ExtTy,
16526 EVT NewVT) const {
16527 // TODO: This may be worth removing. Check regression tests for diffs.
16528 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
16529 return false;
16530
16531 // If we're reducing the load width in order to avoid having to use an extra
16532 // instruction to do extension then it's probably a good idea.
16533 if (ExtTy != ISD::NON_EXTLOAD)
16534 return true;
16535 // Don't reduce load width if it would prevent us from combining a shift into
16536 // the offset.
16537 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
16538 assert(Mem);
16539 const SDValue &Base = Mem->getBasePtr();
16540 if (Base.getOpcode() == ISD::ADD &&
16541 Base.getOperand(1).getOpcode() == ISD::SHL &&
16542 Base.getOperand(1).hasOneUse() &&
16543 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
16544 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16545 if (Mem->getMemoryVT().isScalableVector())
16546 return false;
16547 // The shift can be combined if it matches the size of the value being
16548 // loaded (and so reducing the width would make it not match).
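// For example, an i32 load from (add base, (shl idx, 2)) becomes
// "ldr w0, [x0, x1, lsl #2]"; narrowing it to an i16 load would need
// "lsl #1" and the shift could no longer be folded.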
16549 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
16550 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16551 if (ShiftAmount == Log2_32(LoadBytes))
16552 return false;
16553 }
16554 // We have no reason to disallow reducing the load width, so allow it.
16555 return true;
16556}
16557
16558// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16559 bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16560 EVT VT = Extend.getValueType();
16561 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16562 SDValue Extract = Extend.getOperand(0);
16563 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16564 Extract = Extract.getOperand(0);
16565 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16566 EVT VecVT = Extract.getOperand(0).getValueType();
16567 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16568 return false;
16569 }
16570 }
16571 return true;
16572}
16573
16574 // Truncations from 64-bit GPR to 32-bit GPR are free.
16575 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16576 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16577 return false;
16578 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16579 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16580 return NumBits1 > NumBits2;
16581}
16582 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16583 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16584 return false;
16585 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16586 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16587 return NumBits1 > NumBits2;
16588}
16589
16590/// Check if it is profitable to hoist instruction in then/else to if.
16591 /// Not profitable if I and its user can form a FMA instruction
16592/// because we prefer FMSUB/FMADD.
16593 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16594 if (I->getOpcode() != Instruction::FMul)
16595 return true;
16596
16597 if (!I->hasOneUse())
16598 return true;
16599
16600 Instruction *User = I->user_back();
16601
16602 if (!(User->getOpcode() == Instruction::FSub ||
16603 User->getOpcode() == Instruction::FAdd))
16604 return true;
16605
16606 const TargetOptions &Options = getTargetMachine().Options;
16607 const Function *F = I->getFunction();
16608 const DataLayout &DL = F->getDataLayout();
16609 Type *Ty = User->getOperand(0)->getType();
16610
16611 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
16612 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
16613 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16614 Options.UnsafeFPMath));
16615}
16616
16617// All 32-bit GPR operations implicitly zero the high-half of the corresponding
16618// 64-bit GPR.
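// For example, "add w8, w9, w10" also clears bits [63:32] of x8, so zero
// extending the 32-bit result to 64 bits costs no extra instruction.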
16619 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16620 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16621 return false;
16622 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16623 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16624 return NumBits1 == 32 && NumBits2 == 64;
16625}
16626 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16627 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16628 return false;
16629 unsigned NumBits1 = VT1.getSizeInBits();
16630 unsigned NumBits2 = VT2.getSizeInBits();
16631 return NumBits1 == 32 && NumBits2 == 64;
16632}
16633
16634 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16635 EVT VT1 = Val.getValueType();
16636 if (isZExtFree(VT1, VT2)) {
16637 return true;
16638 }
16639
16640 if (Val.getOpcode() != ISD::LOAD)
16641 return false;
16642
16643 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
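// For example, "ldrh w0, [x1]" zeroes the rest of w0 (and of x0), so
// (zext (load i16)) needs no additional instruction.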
16644 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16645 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16646 VT1.getSizeInBits() <= 32);
16647}
16648
16649bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16650 if (isa<FPExtInst>(Ext))
16651 return false;
16652
16653 // Vector types are not free.
16654 if (Ext->getType()->isVectorTy())
16655 return false;
16656
16657 for (const Use &U : Ext->uses()) {
16658 // The extension is free if we can fold it with a left shift in an
16659 // addressing mode or an arithmetic operation: add, sub, and cmp.
16660
16661 // Is there a shift?
16662 const Instruction *Instr = cast<Instruction>(U.getUser());
16663
16664 // Is this a constant shift?
16665 switch (Instr->getOpcode()) {
16666 case Instruction::Shl:
16667 if (!isa<ConstantInt>(Instr->getOperand(1)))
16668 return false;
16669 break;
16670 case Instruction::GetElementPtr: {
16671 gep_type_iterator GTI = gep_type_begin(Instr);
16672 auto &DL = Ext->getDataLayout();
16673 std::advance(GTI, U.getOperandNo()-1);
16674 Type *IdxTy = GTI.getIndexedType();
16675 // This extension will end up with a shift because of the scaling factor.
16676 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16677 // Get the shift amount based on the scaling factor:
16678 // log2(sizeof(IdxTy)) - log2(8).
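// For example, a GEP over i64 elements scales the index by 8, giving
// ShiftAmt == 3, which fits the addressing mode's "lsl #3" / "uxtw #3" form.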
16679 if (IdxTy->isScalableTy())
16680 return false;
16681 uint64_t ShiftAmt =
16682 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
16683 3;
16684 // Is the constant foldable in the shift of the addressing mode?
16685 // I.e., shift amount is between 1 and 4 inclusive.
16686 if (ShiftAmt == 0 || ShiftAmt > 4)
16687 return false;
16688 break;
16689 }
16690 case Instruction::Trunc:
16691 // Check if this is a noop.
16692 // trunc(sext ty1 to ty2) to ty1.
16693 if (Instr->getType() == Ext->getOperand(0)->getType())
16694 continue;
16695 [[fallthrough]];
16696 default:
16697 return false;
16698 }
16699
16700 // At this point we can use the bfm family, so this extension is free
16701 // for that use.
16702 }
16703 return true;
16704}
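// Worked example for the GetElementPtr case above (illustrative): when the
// extended value indexes an array of i32, the indexed type's store size is
// 32 bits, so ShiftAmt = countr_zero(32) - 3 = 2, which lies in [1, 4] and
// can be folded into the addressing mode, making the extension free.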
16705
16706static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16707 unsigned NumElts, bool IsLittleEndian,
16708 SmallVectorImpl<int> &Mask) {
16709 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16710 return false;
16711
16712 assert(DstWidth % SrcWidth == 0 &&
16713 "TBL lowering is not supported for a conversion instruction with this "
16714 "source and destination element type.");
16715
16716 unsigned Factor = DstWidth / SrcWidth;
16717 unsigned MaskLen = NumElts * Factor;
16718
16719 Mask.clear();
16720 Mask.resize(MaskLen, NumElts);
16721
16722 unsigned SrcIndex = 0;
16723 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16724 Mask[I] = SrcIndex++;
16725
16726 return true;
16727}
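// Example of the mask this produces (illustrative): SrcWidth = 8,
// DstWidth = 32, NumElts = 4, little-endian gives Factor = 4, MaskLen = 16 and
//   Mask = { 0, 4, 4, 4,  1, 4, 4, 4,  2, 4, 4, 4,  3, 4, 4, 4 }
// where index 4 (== NumElts) picks the zero lane inserted into the second
// shuffle operand, so each source byte lands in the low byte of an i32 lane.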
16728
16729static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16730 FixedVectorType *ZExtTy,
16731 FixedVectorType *DstTy,
16732 bool IsLittleEndian) {
16733 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16734 unsigned NumElts = SrcTy->getNumElements();
16735 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16736 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16737
16738 SmallVector<int> Mask;
16739 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16740 return nullptr;
16741
16742 auto *FirstEltZero = Builder.CreateInsertElement(
16743 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16744 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16745 Result = Builder.CreateBitCast(Result, DstTy);
16746 if (DstTy != ZExtTy)
16747 Result = Builder.CreateZExt(Result, ZExtTy);
16748 return Result;
16749}
16750
16751static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
16752 FixedVectorType *DstTy,
16753 bool IsLittleEndian) {
16754 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16755 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16756 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16757
16758 SmallVector<int> Mask;
16759 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
16760 !IsLittleEndian, Mask))
16761 return nullptr;
16762
16763 auto *FirstEltZero = Builder.CreateInsertElement(
16764 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16765
16766 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16767}
16768
16769static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16770 IRBuilder<> Builder(TI);
16771 SmallVector<Value *> Parts;
16772 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16773 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16774 auto *DstTy = cast<FixedVectorType>(TI->getType());
16775 assert(SrcTy->getElementType()->isIntegerTy() &&
16776 "Non-integer type source vector element is not supported");
16777 assert(DstTy->getElementType()->isIntegerTy(8) &&
16778 "Unsupported destination vector element type");
16779 unsigned SrcElemTySz =
16780 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16781 unsigned DstElemTySz =
16782 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16783 assert((SrcElemTySz % DstElemTySz == 0) &&
16784 "Cannot lower truncate to tbl instructions for a source element size "
16785 "that is not divisible by the destination element size");
16786 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16787 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16788 "Unsupported source vector element type size");
16789 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16790
16791 // Create a mask to choose every nth byte from the source vector table of
16792 // bytes to create the truncated destination vector, where 'n' is the truncate
16793 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16794 // 0,8,16,..Y*8th bytes for the little-endian format
16795 SmallVector<Constant *, 16> MaskConst;
16796 for (int Itr = 0; Itr < 16; Itr++) {
16797 if (Itr < NumElements)
16798 MaskConst.push_back(Builder.getInt8(
16799 IsLittleEndian ? Itr * TruncFactor
16800 : Itr * TruncFactor + (TruncFactor - 1)));
16801 else
16802 MaskConst.push_back(Builder.getInt8(255));
16803 }
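 // For example (illustrative): truncating <8 x i32> to <8 x i8> on a
 // little-endian target gives TruncFactor = 4, so the mask bytes are
 // 0, 4, 8, 12, 16, 20, 24, 28 followed by eight 255 entries marking the
 // unused tail of the 16-byte TBL result.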
16804
16805 int MaxTblSz = 128 * 4;
16806 int MaxSrcSz = SrcElemTySz * NumElements;
16807 int ElemsPerTbl =
16808 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16809 assert(ElemsPerTbl <= 16 &&
16810 "Maximum elements selected using TBL instruction cannot exceed 16!");
16811
16812 int ShuffleCount = 128 / SrcElemTySz;
16813 SmallVector<int> ShuffleLanes;
16814 for (int i = 0; i < ShuffleCount; ++i)
16815 ShuffleLanes.push_back(i);
16816
16817 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16818 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16819 // call TBL & save the result in a vector of TBL results for combining later.
16820 SmallVector<Value *> Results;
16821 while (ShuffleLanes.back() < NumElements) {
16822 Parts.push_back(Builder.CreateBitCast(
16823 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16824
16825 if (Parts.size() == 4) {
16826 Parts.push_back(ConstantVector::get(MaskConst));
16827 Results.push_back(
16828 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
16829 Parts.clear();
16830 }
16831
16832 for (int i = 0; i < ShuffleCount; ++i)
16833 ShuffleLanes[i] += ShuffleCount;
16834 }
16835
16836 assert((Parts.empty() || Results.empty()) &&
16837 "Lowering trunc for vectors requiring different TBL instructions is "
16838 "not supported!");
16839 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16840 // registers
16841 if (!Parts.empty()) {
16842 Intrinsic::ID TblID;
16843 switch (Parts.size()) {
16844 case 1:
16845 TblID = Intrinsic::aarch64_neon_tbl1;
16846 break;
16847 case 2:
16848 TblID = Intrinsic::aarch64_neon_tbl2;
16849 break;
16850 case 3:
16851 TblID = Intrinsic::aarch64_neon_tbl3;
16852 break;
16853 }
16854
16855 Parts.push_back(ConstantVector::get(MaskConst));
16856 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
16857 }
16858
16859 // Extract the destination vector from TBL result(s) after combining them
16860 // where applicable. Currently, at most two TBLs are supported.
16861 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16862 "more than 2 tbl instructions!");
16863 Value *FinalResult = Results[0];
16864 if (Results.size() == 1) {
16865 if (ElemsPerTbl < 16) {
16866 SmallVector<int> FinalMask(ElemsPerTbl);
16867 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16868 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16869 }
16870 } else {
16871 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16872 if (ElemsPerTbl < 16) {
16873 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16874 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16875 } else {
16876 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16877 }
16878 FinalResult =
16879 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16880 }
16881
16882 TI->replaceAllUsesWith(FinalResult);
16883 TI->eraseFromParent();
16884}
16885
16886bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16887 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16888 // shuffle_vector instructions are serialized when targeting SVE,
16889 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16890 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16891 return false;
16892
16893 // Try to optimize conversions using tbl. This requires materializing constant
16894 // index vectors, which can increase code size and add loads. Skip the
16895 // transform unless the conversion is in a loop block guaranteed to execute
16896 // and we are not optimizing for size.
16897 Function *F = I->getParent()->getParent();
16898 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16899 F->hasOptSize())
16900 return false;
16901
16902 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16903 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16904 if (!SrcTy || !DstTy)
16905 return false;
16906
16907 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16908 // lowered to tbl instructions to insert the original i8 elements
16909 // into i8x lanes. This is enabled for cases where it is beneficial.
16910 auto *ZExt = dyn_cast<ZExtInst>(I);
16911 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16912 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16913 if (DstWidth % 8 != 0)
16914 return false;
16915
16916 auto *TruncDstType =
16917 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16918 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16919 // the remaining ZExt folded into the user, don't use tbl lowering.
16920 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16921 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16922 TargetTransformInfo::CastContextHint::None,
16923 TargetTransformInfo::TCK_SizeAndLatency) == TargetTransformInfo::TCC_Basic) {
16924 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16925 return false;
16926
16927 DstTy = TruncDstType;
16928 }
16929
16930 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
16931 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
16932 // most one extra extend step is needed and using tbl is not profitable.
16933 if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
16934 auto *SingleUser = cast<Instruction>(*I->user_begin());
16935 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
16936 return false;
16937 }
16938
16939 if (DstTy->getScalarSizeInBits() >= 64)
16940 return false;
16941
16942 IRBuilder<> Builder(ZExt);
16943 Value *Result = createTblShuffleForZExt(
16944 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16945 DstTy, Subtarget->isLittleEndian());
16946 if (!Result)
16947 return false;
16948 ZExt->replaceAllUsesWith(Result);
16949 ZExt->eraseFromParent();
16950 return true;
16951 }
16952
16953 auto *UIToFP = dyn_cast<UIToFPInst>(I);
16954 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
16955 DstTy->getElementType()->isFloatTy()) ||
16956 (SrcTy->getElementType()->isIntegerTy(16) &&
16957 DstTy->getElementType()->isDoubleTy()))) {
16958 IRBuilder<> Builder(I);
16959 Value *ZExt = createTblShuffleForZExt(
16960 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16961 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16962 assert(ZExt && "Cannot fail for the i8 to float conversion");
16963 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16964 I->replaceAllUsesWith(UI);
16965 I->eraseFromParent();
16966 return true;
16967 }
16968
16969 auto *SIToFP = dyn_cast<SIToFPInst>(I);
16970 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16971 DstTy->getElementType()->isFloatTy()) {
16972 IRBuilder<> Builder(I);
16973 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
16974 FixedVectorType::getInteger(DstTy),
16975 Subtarget->isLittleEndian());
16976 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16977 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
16978 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
16979 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
16980 I->replaceAllUsesWith(SI);
16981 I->eraseFromParent();
16982 return true;
16983 }
16984
16985 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16986 // followed by a truncate lowered to using tbl.4.
16987 auto *FPToUI = dyn_cast<FPToUIInst>(I);
16988 if (FPToUI &&
16989 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16990 SrcTy->getElementType()->isFloatTy() &&
16991 DstTy->getElementType()->isIntegerTy(8)) {
16992 IRBuilder<> Builder(I);
16993 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16994 VectorType::getInteger(SrcTy));
16995 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16996 I->replaceAllUsesWith(TruncI);
16997 I->eraseFromParent();
16998 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16999 return true;
17000 }
17001
17002 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17003 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17004 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17005 // registers
17006 auto *TI = dyn_cast<TruncInst>(I);
17007 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17008 ((SrcTy->getElementType()->isIntegerTy(32) ||
17009 SrcTy->getElementType()->isIntegerTy(64)) &&
17010 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17011 createTblForTrunc(TI, Subtarget->isLittleEndian());
17012 return true;
17013 }
17014
17015 return false;
17016}
17017
17018bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
17019 Align &RequiredAligment) const {
17020 if (!LoadedType.isSimple() ||
17021 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17022 return false;
17023 // Cyclone supports unaligned accesses.
17024 RequiredAligment = Align(1);
17025 unsigned NumBits = LoadedType.getSizeInBits();
17026 return NumBits == 32 || NumBits == 64;
17027}
17028
17029/// A helper function for determining the number of interleaved accesses we
17030/// will generate when lowering accesses of the given type.
17031unsigned AArch64TargetLowering::getNumInterleavedAccesses(
17032 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17033 unsigned VecSize = 128;
17034 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17035 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17036 if (UseScalable && isa<FixedVectorType>(VecTy))
17037 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17038 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17039}
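// For example (illustrative): a <16 x i32> access is 512 bits, so with
// 128-bit NEON vectors this returns (16 * 32 + 127) / 128 = 4; with
// UseScalable and a 256-bit minimum SVE vector length it returns 2.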
17040
17041MachineMemOperand::Flags
17042AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17043 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17044 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17045 return MOStridedAccess;
17046 return MachineMemOperand::MONone;
17047}
17048
17049bool AArch64TargetLowering::isLegalInterleavedAccessType(
17050 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17051 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17052 auto EC = VecTy->getElementCount();
17053 unsigned MinElts = EC.getKnownMinValue();
17054
17055 UseScalable = false;
17056
17057 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17058 (!Subtarget->useSVEForFixedLengthVectors() ||
17060 return false;
17061
17062 if (isa<ScalableVectorType>(VecTy) &&
17063 !Subtarget->isSVEorStreamingSVEAvailable())
17064 return false;
17065
17066 // Ensure the number of vector elements is greater than 1.
17067 if (MinElts < 2)
17068 return false;
17069
17070 // Ensure the element type is legal.
17071 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17072 return false;
17073
17074 if (EC.isScalable()) {
17075 UseScalable = true;
17076 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17077 }
17078
17079 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17080 if (Subtarget->useSVEForFixedLengthVectors()) {
17081 unsigned MinSVEVectorSize =
17082 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17083 if (VecSize % MinSVEVectorSize == 0 ||
17084 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17085 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17086 UseScalable = true;
17087 return true;
17088 }
17089 }
17090
17091 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17092 // 128 will be split into multiple interleaved accesses.
17093 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17094}
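// For example (illustrative, assuming NEON is available and fixed-length SVE
// is not forced): <2 x i32> (64 bits) and <4 x i32> (128 bits) are legal,
// <8 x i32> (256 bits) is legal and later split into two accesses, while
// <3 x i32> (96 bits) is rejected.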
17095
17096static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17097 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17098 return ScalableVectorType::get(VTy->getElementType(), 2);
17099
17100 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17101 return ScalableVectorType::get(VTy->getElementType(), 4);
17102
17103 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17104 return ScalableVectorType::get(VTy->getElementType(), 8);
17105
17106 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17107 return ScalableVectorType::get(VTy->getElementType(), 8);
17108
17109 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17110 return ScalableVectorType::get(VTy->getElementType(), 2);
17111
17112 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17113 return ScalableVectorType::get(VTy->getElementType(), 4);
17114
17115 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17116 return ScalableVectorType::get(VTy->getElementType(), 8);
17117
17118 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17119 return ScalableVectorType::get(VTy->getElementType(), 16);
17120
17121 llvm_unreachable("Cannot handle input vector type");
17122}
17123
17124static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17125 bool Scalable, Type *LDVTy,
17126 Type *PtrTy) {
17127 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17128 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17129 Intrinsic::aarch64_sve_ld3_sret,
17130 Intrinsic::aarch64_sve_ld4_sret};
17131 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17132 Intrinsic::aarch64_neon_ld3,
17133 Intrinsic::aarch64_neon_ld4};
17134 if (Scalable)
17135 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17136
17137 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17138 {LDVTy, PtrTy});
17139}
17140
17141static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17142 bool Scalable, Type *STVTy,
17143 Type *PtrTy) {
17144 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17145 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17146 Intrinsic::aarch64_sve_st3,
17147 Intrinsic::aarch64_sve_st4};
17148 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17149 Intrinsic::aarch64_neon_st3,
17150 Intrinsic::aarch64_neon_st4};
17151 if (Scalable)
17152 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17153
17154 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17155 {STVTy, PtrTy});
17156}
17157
17158/// Lower an interleaved load into a ldN intrinsic.
17159///
17160/// E.g. Lower an interleaved load (Factor = 2):
17161/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17162/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17163/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17164///
17165/// Into:
17166/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17167/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17168/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17169bool AArch64TargetLowering::lowerInterleavedLoad(
17170 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
17171 ArrayRef<unsigned> Indices, unsigned Factor) const {
17172 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17173 "Invalid interleave factor");
17174 assert(!Shuffles.empty() && "Empty shufflevector input");
17175 assert(Shuffles.size() == Indices.size() &&
17176 "Unmatched number of shufflevectors and indices");
17177
17178 const DataLayout &DL = LI->getDataLayout();
17179
17180 VectorType *VTy = Shuffles[0]->getType();
17181
17182 // Skip if we do not have NEON and skip illegal vector types. We can
17183 // "legalize" wide vector types into multiple interleaved accesses as long as
17184 // the vector types are divisible by 128.
17185 bool UseScalable;
17186 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17187 return false;
17188
17189 // Check if the interleave is a zext(shuffle), that can be better optimized
17190 // into shift / and masks. For the moment we do this just for uitofp (not
17191 // zext) to avoid issues with widening instructions.
17192 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17193 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17194 SI->getType()->getScalarSizeInBits() * 4 ==
17195 SI->user_back()->getType()->getScalarSizeInBits();
17196 }))
17197 return false;
17198
17199 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17200
17201 auto *FVTy = cast<FixedVectorType>(VTy);
17202
17203 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17204 // load integer vectors first and then convert to pointer vectors.
17205 Type *EltTy = FVTy->getElementType();
17206 if (EltTy->isPointerTy())
17207 FVTy =
17208 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17209
17210 // If we're going to generate more than one load, reset the sub-vector type
17211 // to something legal.
17212 FVTy = FixedVectorType::get(FVTy->getElementType(),
17213 FVTy->getNumElements() / NumLoads);
17214
17215 auto *LDVTy =
17216 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17217
17218 IRBuilder<> Builder(LI);
17219
17220 // The base address of the load.
17221 Value *BaseAddr = LI->getPointerOperand();
17222
17223 Type *PtrTy = LI->getPointerOperandType();
17224 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17225 LDVTy->getElementCount());
17226
17227 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17228 UseScalable, LDVTy, PtrTy);
17229
17230 // Holds sub-vectors extracted from the load intrinsic return values. The
17231 // sub-vectors are associated with the shufflevector instructions they will
17232 // replace.
17233 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17234
17235 Value *PTrue = nullptr;
17236 if (UseScalable) {
17237 std::optional<unsigned> PgPattern =
17238 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17239 if (Subtarget->getMinSVEVectorSizeInBits() ==
17240 Subtarget->getMaxSVEVectorSizeInBits() &&
17241 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17242 PgPattern = AArch64SVEPredPattern::all;
17243
17244 auto *PTruePat =
17245 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17246 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17247 {PTruePat});
17248 }
17249
17250 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17251
17252 // If we're generating more than one load, compute the base address of
17253 // subsequent loads as an offset from the previous.
17254 if (LoadCount > 0)
17255 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17256 FVTy->getNumElements() * Factor);
17257
17258 CallInst *LdN;
17259 if (UseScalable)
17260 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17261 else
17262 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17263
17264 // Extract and store the sub-vectors returned by the load intrinsic.
17265 for (unsigned i = 0; i < Shuffles.size(); i++) {
17266 ShuffleVectorInst *SVI = Shuffles[i];
17267 unsigned Index = Indices[i];
17268
17269 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17270
17271 if (UseScalable)
17272 SubVec = Builder.CreateExtractVector(
17273 FVTy, SubVec,
17274 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
17275
17276 // Convert the integer vector to pointer vector if the element is pointer.
17277 if (EltTy->isPointerTy())
17278 SubVec = Builder.CreateIntToPtr(
17279 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17280 FVTy->getNumElements()));
17281
17282 SubVecs[SVI].push_back(SubVec);
17283 }
17284 }
17285
17286 // Replace uses of the shufflevector instructions with the sub-vectors
17287 // returned by the load intrinsic. If a shufflevector instruction is
17288 // associated with more than one sub-vector, those sub-vectors will be
17289 // concatenated into a single wide vector.
17290 for (ShuffleVectorInst *SVI : Shuffles) {
17291 auto &SubVec = SubVecs[SVI];
17292 auto *WideVec =
17293 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17294 SVI->replaceAllUsesWith(WideVec);
17295 }
17296
17297 return true;
17298}
17299
17300template <typename Iter>
17301bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17302 int MaxLookupDist = 20;
17303 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17304 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17305 const Value *PtrA1 =
17306 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17307
17308 while (++It != End) {
17309 if (It->isDebugOrPseudoInst())
17310 continue;
17311 if (MaxLookupDist-- == 0)
17312 break;
17313 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17314 const Value *PtrB1 =
17315 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17316 DL, OffsetB);
17317 if (PtrA1 == PtrB1 &&
17318 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17319 .abs() == 16)
17320 return true;
17321 }
17322 }
17323
17324 return false;
17325}
17326
17327/// Lower an interleaved store into a stN intrinsic.
17328///
17329/// E.g. Lower an interleaved store (Factor = 3):
17330/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17331/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17332/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17333///
17334/// Into:
17335/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17336/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17337/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17338/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17339///
17340/// Note that the new shufflevectors will be removed and we'll only generate one
17341/// st3 instruction in CodeGen.
17342///
17343/// Example for a more general valid mask (Factor 3). Lower:
17344/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17345/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17346/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17347///
17348/// Into:
17349/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17350/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17351/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17352/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17353bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
17354 ShuffleVectorInst *SVI,
17355 unsigned Factor) const {
17356
17357 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17358 "Invalid interleave factor");
17359
17360 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17361 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17362
17363 unsigned LaneLen = VecTy->getNumElements() / Factor;
17364 Type *EltTy = VecTy->getElementType();
17365 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17366
17367 const DataLayout &DL = SI->getDataLayout();
17368 bool UseScalable;
17369
17370 // Skip if we do not have NEON and skip illegal vector types. We can
17371 // "legalize" wide vector types into multiple interleaved accesses as long as
17372 // the vector types are divisible by 128.
17373 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17374 return false;
17375
17376 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17377
17378 Value *Op0 = SVI->getOperand(0);
17379 Value *Op1 = SVI->getOperand(1);
17380 IRBuilder<> Builder(SI);
17381
17382 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17383 // vectors to integer vectors.
17384 if (EltTy->isPointerTy()) {
17385 Type *IntTy = DL.getIntPtrType(EltTy);
17386 unsigned NumOpElts =
17387 cast<FixedVectorType>(Op0->getType())->getNumElements();
17388
17389 // Convert to the corresponding integer vector.
17390 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17391 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17392 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17393
17394 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17395 }
17396
17397 // If we're going to generate more than one store, reset the lane length
17398 // and sub-vector type to something legal.
17399 LaneLen /= NumStores;
17400 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17401
17402 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17403 : SubVecTy;
17404
17405 // The base address of the store.
17406 Value *BaseAddr = SI->getPointerOperand();
17407
17408 auto Mask = SVI->getShuffleMask();
17409
17410 // Sanity check if all the indices are NOT in range.
17411 // If mask is `poison`, `Mask` may be a vector of -1s.
17412 // If all of them are `poison`, OOB read will happen later.
17413 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17414 return false;
17415 }
17416 // A 64-bit st2 which does not start at element 0 will involve adding extra
17417 // ext elements making the st2 unprofitable, and if there is a nearby store
17418 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
17419 // zip;ldp pair which has higher throughput.
17420 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17421 (Mask[0] != 0 ||
17422 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17423 DL) ||
17424 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17425 BaseAddr, DL)))
17426 return false;
17427
17428 Type *PtrTy = SI->getPointerOperandType();
17429 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17430 STVTy->getElementCount());
17431
17432 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17433 UseScalable, STVTy, PtrTy);
17434
17435 Value *PTrue = nullptr;
17436 if (UseScalable) {
17437 std::optional<unsigned> PgPattern =
17438 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17439 if (Subtarget->getMinSVEVectorSizeInBits() ==
17440 Subtarget->getMaxSVEVectorSizeInBits() &&
17441 Subtarget->getMinSVEVectorSizeInBits() ==
17442 DL.getTypeSizeInBits(SubVecTy))
17443 PgPattern = AArch64SVEPredPattern::all;
17444
17445 auto *PTruePat =
17446 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
17447 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17448 {PTruePat});
17449 }
17450
17451 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17452
17453 SmallVector<Value *, 5> Ops;
17454
17455 // Split the shufflevector operands into sub vectors for the new stN call.
17456 for (unsigned i = 0; i < Factor; i++) {
17457 Value *Shuffle;
17458 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17459 if (Mask[IdxI] >= 0) {
17460 Shuffle = Builder.CreateShuffleVector(
17461 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17462 } else {
17463 unsigned StartMask = 0;
17464 for (unsigned j = 1; j < LaneLen; j++) {
17465 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17466 if (Mask[IdxJ] >= 0) {
17467 StartMask = Mask[IdxJ] - j;
17468 break;
17469 }
17470 }
17471 // Note: Filling undef gaps with random elements is ok, since
17472 // those elements were being written anyway (with undefs).
17473 // In the case of all undefs we're defaulting to using elems from 0
17474 // Note: StartMask cannot be negative, it's checked in
17475 // isReInterleaveMask
17476 Shuffle = Builder.CreateShuffleVector(
17477 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17478 }
17479
17480 if (UseScalable)
17481 Shuffle = Builder.CreateInsertVector(
17482 STVTy, UndefValue::get(STVTy), Shuffle,
17483 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
17484
17485 Ops.push_back(Shuffle);
17486 }
17487
17488 if (UseScalable)
17489 Ops.push_back(PTrue);
17490
17491 // If we're generating more than one store, we compute the base address of
17492 // subsequent stores as an offset from the previous.
17493 if (StoreCount > 0)
17494 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17495 BaseAddr, LaneLen * Factor);
17496
17497 Ops.push_back(BaseAddr);
17498 Builder.CreateCall(StNFunc, Ops);
17499 }
17500 return true;
17501}
17502
17503bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17504 LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
17505 unsigned Factor = DeinterleavedValues.size();
17506 if (Factor != 2 && Factor != 4) {
17507 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17508 return false;
17509 }
17510
17511 VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
17512
17513 const DataLayout &DL = LI->getModule()->getDataLayout();
17514 bool UseScalable;
17515 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17516 return false;
17517
17518 // TODO: Add support for using SVE instructions with fixed types later, using
17519 // the code from lowerInterleavedLoad to obtain the correct container type.
17520 if (UseScalable && !VTy->isScalableTy())
17521 return false;
17522
17523 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17524 VectorType *LdTy =
17525 VectorType::get(VTy->getElementType(),
17526 VTy->getElementCount().divideCoefficientBy(NumLoads));
17527
17528 Type *PtrTy = LI->getPointerOperandType();
17529 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17530 UseScalable, LdTy, PtrTy);
17531
17532 IRBuilder<> Builder(LI);
17533 Value *Pred = nullptr;
17534 if (UseScalable)
17535 Pred =
17536 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17537
17538 Value *BaseAddr = LI->getPointerOperand();
17539 if (NumLoads > 1) {
17540 // Create multiple legal small ldN.
17541 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
17542 for (unsigned I = 0; I < NumLoads; ++I) {
17543 Value *Offset = Builder.getInt64(I * Factor);
17544
17545 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17546 Value *LdN = nullptr;
17547 if (UseScalable)
17548 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17549 else
17550 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17551 Value *Idx =
17552 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17553 for (unsigned J = 0; J < Factor; ++J) {
17554 ExtractedLdValues[J] = Builder.CreateInsertVector(
17555 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
17556 }
17557 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17558 }
17559 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17560 for (unsigned J = 0; J < Factor; ++J)
17561 DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
17562 } else {
17563 Value *Result;
17564 if (UseScalable)
17565 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17566 else
17567 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17568 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17569 for (unsigned I = 0; I < Factor; I++) {
17570 Value *NewExtract = Builder.CreateExtractValue(Result, I);
17571 DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
17572 }
17573 }
17574 return true;
17575}
17576
17577bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17578 StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
17579 unsigned Factor = InterleavedValues.size();
17580 if (Factor != 2 && Factor != 4) {
17581 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17582 return false;
17583 }
17584
17585 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
17586 const DataLayout &DL = SI->getModule()->getDataLayout();
17587
17588 bool UseScalable;
17589 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17590 return false;
17591
17592 // TODO: Add support for using SVE instructions with fixed types later, using
17593 // the code from lowerInterleavedStore to obtain the correct container type.
17594 if (UseScalable && !VTy->isScalableTy())
17595 return false;
17596
17597 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
17598
17599 VectorType *StTy =
17600 VectorType::get(VTy->getElementType(),
17601 VTy->getElementCount().divideCoefficientBy(NumStores));
17602
17603 Type *PtrTy = SI->getPointerOperandType();
17604 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17605 UseScalable, StTy, PtrTy);
17606
17607 IRBuilder<> Builder(SI);
17608
17609 Value *BaseAddr = SI->getPointerOperand();
17610 Value *Pred = nullptr;
17611
17612 if (UseScalable)
17613 Pred =
17614 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
17615
17616 auto ExtractedValues = InterleavedValues;
17617 SmallVector<Value *, 4> StoreOperands(InterleavedValues.begin(),
17618 InterleavedValues.end());
17619 if (UseScalable)
17620 StoreOperands.push_back(Pred);
17621 StoreOperands.push_back(BaseAddr);
17622 for (unsigned I = 0; I < NumStores; ++I) {
17623 Value *Address = BaseAddr;
17624 if (NumStores > 1) {
17625 Value *Offset = Builder.getInt64(I * Factor);
17626 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
17627 Value *Idx =
17628 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
17629 for (unsigned J = 0; J < Factor; J++) {
17630 StoreOperands[J] =
17631 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
17632 }
17633 // update the address
17634 StoreOperands[StoreOperands.size() - 1] = Address;
17635 }
17636 Builder.CreateCall(StNFunc, StoreOperands);
17637 }
17638 return true;
17639}
17640
17641EVT AArch64TargetLowering::getOptimalMemOpType(
17642 const MemOp &Op, const AttributeList &FuncAttributes) const {
17643 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17644 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17645 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17646 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17647 // taken one instruction to materialize the v2i64 zero and one store (with
17648 // restrictive addressing mode). Just do i64 stores.
17649 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17650 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17651 if (Op.isAligned(AlignCheck))
17652 return true;
17653 unsigned Fast;
17654 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17655 MachineMemOperand::MONone, &Fast) &&
17656 Fast;
17657 };
17658
17659 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17660 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17661 return MVT::v16i8;
17662 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17663 return MVT::f128;
17664 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17665 return MVT::i64;
17666 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17667 return MVT::i32;
17668 return MVT::Other;
17669}
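// For example (illustrative): a 64-byte memset with NEON available and a
// 16-byte-aligned (or fast misaligned) destination selects MVT::v16i8, while
// a 16-byte memset counts as small and falls back to MVT::i64 stores
// (assuming 8-byte alignment).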
17670
17671LLT AArch64TargetLowering::getOptimalMemOpLLT(
17672 const MemOp &Op, const AttributeList &FuncAttributes) const {
17673 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17674 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17675 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17676 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17677 // taken one instruction to materialize the v2i64 zero and one store (with
17678 // restrictive addressing mode). Just do i64 stores.
17679 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17680 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17681 if (Op.isAligned(AlignCheck))
17682 return true;
17683 unsigned Fast;
17684 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17685 MachineMemOperand::MONone, &Fast) &&
17686 Fast;
17687 };
17688
17689 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17690 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17691 return LLT::fixed_vector(2, 64);
17692 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17693 return LLT::scalar(128);
17694 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17695 return LLT::scalar(64);
17696 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17697 return LLT::scalar(32);
17698 return LLT();
17699}
17700
17701// 12-bit optionally shifted immediates are legal for adds.
17702bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17703 if (Immed == std::numeric_limits<int64_t>::min()) {
17704 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17705 << ": avoid UB for INT64_MIN\n");
17706 return false;
17707 }
17708 // Same encoding for add/sub, just flip the sign.
17709 Immed = std::abs(Immed);
17710 bool IsLegal = ((Immed >> 12) == 0 ||
17711 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17712 LLVM_DEBUG(dbgs() << "Is " << Immed
17713 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17714 return IsLegal;
17715}
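// For example (illustrative): 4095 (0xfff) is a legal add immediate, and so
// is 0x123000 (low 12 bits clear, fits the shifted-by-12 form), but 0x1001
// needs bits in both halves and is rejected.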
17716
17717bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17718 // We will only emit addvl/inc* instructions for SVE2
17719 if (!Subtarget->hasSVE2())
17720 return false;
17721
17722 // addvl's immediates are in terms of the number of bytes in a register.
17723 // Since there are 16 in the base supported size (128bits), we need to
17724 // divide the immediate by that much to give us a useful immediate to
17725 // multiply by vscale. We can't have a remainder as a result of this.
17726 if (Imm % 16 == 0)
17727 return isInt<6>(Imm / 16);
17728
17729 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17730 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17731 // of addvl as a result, so only take h|w|d into account.
17732 // Dec[h|w|d] will cover subtractions.
17733 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17734 // FIXME: Can we make use of other patterns to cover other immediates?
17735
17736 // inch|dech
17737 if (Imm % 8 == 0)
17738 return std::abs(Imm / 8) <= 16;
17739 // incw|decw
17740 if (Imm % 4 == 0)
17741 return std::abs(Imm / 4) <= 16;
17742 // incd|decd
17743 if (Imm % 2 == 0)
17744 return std::abs(Imm / 2) <= 16;
17745
17746 return false;
17747}
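// For example (illustrative): Imm = 32 is 2 * 16 bytes and maps to "addvl #2";
// Imm = 40 is not a multiple of 16 but equals 5 * 8, so an inch/dech-style
// increment with multiplier 5 covers it; Imm = 7 matches no pattern and is
// rejected.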
17748
17749// Return false to prevent folding
17750// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17751// if the folding leads to worse code.
17752bool AArch64TargetLowering::isMulAddWithConstProfitable(
17753 SDValue AddNode, SDValue ConstNode) const {
17754 // Let the DAGCombiner decide for vector types and large types.
17755 const EVT VT = AddNode.getValueType();
17756 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17757 return true;
17758
17759 // It is worse if c1 is legal add immediate, while c1*c2 is not
17760 // and has to be composed by at least two instructions.
17761 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17762 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17763 const int64_t C1 = C1Node->getSExtValue();
17764 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17765 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
17766 return true;
17767 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17768 // Adapt to the width of a register.
17769 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17770 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
17771 if (Insn.size() > 1)
17772 return false;
17773
17774 // Default to true and let the DAGCombiner decide.
17775 return true;
17776}
17777
17778// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17779// immediates is the same as for an add or a sub.
17780bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17781 return isLegalAddImmediate(Immed);
17782}
17783
17784/// isLegalAddressingMode - Return true if the addressing mode represented
17785/// by AM is legal for this target, for a load/store of the specified type.
17786bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17787 const AddrMode &AMode, Type *Ty,
17788 unsigned AS, Instruction *I) const {
17789 // AArch64 has five basic addressing modes:
17790 // reg
17791 // reg + 9-bit signed offset
17792 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17793 // reg1 + reg2
17794 // reg + SIZE_IN_BYTES * reg
17795
17796 // No global is ever allowed as a base.
17797 if (AMode.BaseGV)
17798 return false;
17799
17800 // No reg+reg+imm addressing.
17801 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17802 return false;
17803
17804 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17805 // `2*ScaledReg` into `BaseReg + ScaledReg`
17806 AddrMode AM = AMode;
17807 if (AM.Scale && !AM.HasBaseReg) {
17808 if (AM.Scale == 1) {
17809 AM.HasBaseReg = true;
17810 AM.Scale = 0;
17811 } else if (AM.Scale == 2) {
17812 AM.HasBaseReg = true;
17813 AM.Scale = 1;
17814 } else {
17815 return false;
17816 }
17817 }
17818
17819 // A base register is required in all addressing modes.
17820 if (!AM.HasBaseReg)
17821 return false;
17822
17823 if (Ty->isScalableTy()) {
17824 if (isa<ScalableVectorType>(Ty)) {
17825 // See if we have a foldable vscale-based offset, for vector types which
17826 // are either legal or smaller than the minimum; more work will be
17827 // required if we need to consider addressing for types which need
17828 // legalization by splitting.
17829 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17830 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17831 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17832 isPowerOf2_64(VecNumBytes))
17833 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17834
17835 uint64_t VecElemNumBytes =
17836 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17837 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17838 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17839 }
17840
17841 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17842 }
17843
17844 // No scalable offsets allowed for non-scalable types.
17845 if (AM.ScalableOffset)
17846 return false;
17847
17848 // check reg + imm case:
17849 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17850 uint64_t NumBytes = 0;
17851 if (Ty->isSized()) {
17852 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17853 NumBytes = NumBits / 8;
17854 if (!isPowerOf2_64(NumBits))
17855 NumBytes = 0;
17856 }
17857
17858 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
17859 AM.Scale);
17860}
17861
17862// Check whether the two offsets belong to the same imm24 range and share the
17863// same high 12 bits; in that case the high part can be folded into the
17863// immediate of an add.
17864int64_t
17865AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17866 int64_t MaxOffset) const {
17867 int64_t HighPart = MinOffset & ~0xfffULL;
17868 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
17869 // Rebase the value to an integer multiple of imm12.
17870 return HighPart;
17871 }
17872
17873 return 0;
17874}
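// For example (illustrative): MinOffset = 0x1234 and MaxOffset = 0x1ff8 share
// the high part 0x1000, which is itself a legal add immediate, so 0x1000 is
// returned and both accesses can be addressed as 0x1000 plus a uimm12 offset.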
17875
17876bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17877 // Consider splitting large offset of struct or array.
17878 return true;
17879}
17880
17881bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17882 const MachineFunction &MF, EVT VT) const {
17883 VT = VT.getScalarType();
17884
17885 if (!VT.isSimple())
17886 return false;
17887
17888 switch (VT.getSimpleVT().SimpleTy) {
17889 case MVT::f16:
17890 return Subtarget->hasFullFP16();
17891 case MVT::f32:
17892 case MVT::f64:
17893 return true;
17894 default:
17895 break;
17896 }
17897
17898 return false;
17899}
17900
17901bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17902 Type *Ty) const {
17903 switch (Ty->getScalarType()->getTypeID()) {
17904 case Type::FloatTyID:
17905 case Type::DoubleTyID:
17906 return true;
17907 default:
17908 return false;
17909 }
17910}
17911
17912bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17913 EVT VT, CodeGenOptLevel OptLevel) const {
17914 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17915 !useSVEForFixedLengthVectorVT(VT);
17916}
17917
17918const MCPhysReg *
17919AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17920 // LR is a callee-save register, but we must treat it as clobbered by any call
17921 // site. Hence we include LR in the scratch registers, which are in turn added
17922 // as implicit-defs for stackmaps and patchpoints.
17923 static const MCPhysReg ScratchRegs[] = {
17924 AArch64::X16, AArch64::X17, AArch64::LR, 0
17925 };
17926 return ScratchRegs;
17927}
17928
17929ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17930 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17931 return RCRegs;
17932}
17933
17934bool
17935AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17936 CombineLevel Level) const {
17937 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17938 N->getOpcode() == ISD::SRL) &&
17939 "Expected shift op");
17940
17941 SDValue ShiftLHS = N->getOperand(0);
17942 EVT VT = N->getValueType(0);
17943
17944 if (!ShiftLHS->hasOneUse())
17945 return false;
17946
17947 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
17948 !ShiftLHS.getOperand(0)->hasOneUse())
17949 return false;
17950
17951 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
17952 // combine it with shift 'N' to let it be lowered to UBFX except:
17953 // ((x >> C) & mask) << C.
17954 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17955 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
17956 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
17957 if (isMask_64(TruncMask)) {
17958 SDValue AndLHS = ShiftLHS.getOperand(0);
17959 if (AndLHS.getOpcode() == ISD::SRL) {
17960 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
17961 if (N->getOpcode() == ISD::SHL)
17962 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
17963 return SRLC->getZExtValue() == SHLC->getZExtValue();
17964 return false;
17965 }
17966 }
17967 }
17968 }
17969 return true;
17970}
17971
17972bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17973 const SDNode *N) const {
17974 assert(N->getOpcode() == ISD::XOR &&
17975 (N->getOperand(0).getOpcode() == ISD::SHL ||
17976 N->getOperand(0).getOpcode() == ISD::SRL) &&
17977 "Expected XOR(SHIFT) pattern");
17978
17979 // Only commute if the entire NOT mask is a hidden shifted mask.
17980 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
17981 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17982 if (XorC && ShiftC) {
17983 unsigned MaskIdx, MaskLen;
17984 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17985 unsigned ShiftAmt = ShiftC->getZExtValue();
17986 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17987 if (N->getOperand(0).getOpcode() == ISD::SHL)
17988 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17989 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17990 }
17991 }
17992
17993 return false;
17994}
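// For example (illustrative, i32): xor (shl X, 8), 0xffffff00 has a NOT mask
// that is a shifted mask with MaskIdx == 8 and MaskLen == 24, so it may be
// rewritten as shl (xor X, 0x00ffffff), 8 and the NOT folded into a
// shifted-register operand; xor (shl X, 8), 0x0000ff00 does not qualify.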
17995
17996bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17997 const SDNode *N, CombineLevel Level) const {
17998 assert(((N->getOpcode() == ISD::SHL &&
17999 N->getOperand(0).getOpcode() == ISD::SRL) ||
18000 (N->getOpcode() == ISD::SRL &&
18001 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18002 "Expected shift-shift mask");
18003 // Don't allow multiuse shift folding with the same shift amount.
18004 if (!N->getOperand(0)->hasOneUse())
18005 return false;
18006
18007 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18008 EVT VT = N->getValueType(0);
18009 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18010 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18011 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18012 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18013 }
18014
18015 // We do not need to fold when this shift is used in a specific load case:
18016 // (ldr x, (add x, (shl (srl x, c1) 2)))
18017 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18018 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18019 unsigned ShlAmt = C2->getZExtValue();
18020 if (auto ShouldADD = *N->user_begin();
18021 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18022 if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18023 unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
18024 if ((1ULL << ShlAmt) == ByteVT &&
18025 isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
18026 return false;
18027 }
18028 }
18029 }
18030 }
18031
18032 return true;
18033}
18034
18035bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18036 unsigned BinOpcode, EVT VT) const {
18037 return VT.isScalableVector() && isTypeLegal(VT);
18038}
18039
18040bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18041 Type *Ty) const {
18042 assert(Ty->isIntegerTy());
18043
18044 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18045 if (BitSize == 0)
18046 return false;
18047
18048 int64_t Val = Imm.getSExtValue();
18049 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18050 return true;
18051
18052 if ((int64_t)Val < 0)
18053 Val = ~Val;
18054 if (BitSize == 32)
18055 Val &= (1LL << 32) - 1;
18056
18057 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18058 // MOVZ is free so return true for one or fewer MOVK.
18059 return Shift < 3;
18060}
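// For example (illustrative): Imm = 0x0000123400000000 is not a logical
// immediate and its highest set bit is bit 44, so Shift = 44 / 16 = 2 < 3 and
// the hook returns true (materialize with MOV-immediate instructions); a dense
// constant like 0x1234567890abcdef gives Shift = 3 and stays a load.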
18061
18062bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18063 unsigned Index) const {
18064 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18065 return false;
18066
18067 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18068}
18069
18070/// Turn vector tests of the signbit in the form of:
18071/// xor (sra X, elt_size(X)-1), -1
18072/// into:
18073/// cmge X, X, #0
18074static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18075 const AArch64Subtarget *Subtarget) {
18076 EVT VT = N->getValueType(0);
18077 if (!Subtarget->hasNEON() || !VT.isVector())
18078 return SDValue();
18079
18080 // There must be a shift right algebraic before the xor, and the xor must be a
18081 // 'not' operation.
18082 SDValue Shift = N->getOperand(0);
18083 SDValue Ones = N->getOperand(1);
18084 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18085 !ISD::isBuildVectorAllOnes(Ones.getNode()))
18086 return SDValue();
18087
18088 // The shift should be smearing the sign bit across each vector element.
18089 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18090 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18091 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18092 return SDValue();
18093
18094 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
18095}
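// For example (illustrative): for v4i32, xor (VASHR X, 31), splat(-1)
// computes a per-lane "X >= 0" mask and is replaced by a single CMGEz node,
// i.e. the cmge-with-zero form shown in the comment above.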
18096
18097// Given a vecreduce_add node, detect the below pattern and convert it to the
18098// node sequence with UABDL, [S|U]ABD and UADDLP.
18099//
18100// i32 vecreduce_add(
18101// v16i32 abs(
18102// v16i32 sub(
18103// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18104// =================>
18105// i32 vecreduce_add(
18106// v4i32 UADDLP(
18107// v8i16 add(
18108// v8i16 zext(
18109// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18110// v8i16 zext(
18111// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18112static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18113 SelectionDAG &DAG) {
18114 // Assumed i32 vecreduce_add
18115 if (N->getValueType(0) != MVT::i32)
18116 return SDValue();
18117
18118 SDValue VecReduceOp0 = N->getOperand(0);
18119 unsigned Opcode = VecReduceOp0.getOpcode();
18120 // Assumed v16i32 abs
18121 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
18122 return SDValue();
18123
18124 SDValue ABS = VecReduceOp0;
18125 // Assumed v16i32 sub
18126 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18127 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
18128 return SDValue();
18129
18130 SDValue SUB = ABS->getOperand(0);
18131 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18132 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18133 // Assumed v16i32 type
18134 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
18135 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
18136 return SDValue();
18137
18138 // Assumed zext or sext
18139 bool IsZExt = false;
18140 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18141 IsZExt = true;
18142 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18143 IsZExt = false;
18144 } else
18145 return SDValue();
18146
18147 SDValue EXT0 = SUB->getOperand(0);
18148 SDValue EXT1 = SUB->getOperand(1);
18149 // Assumed zext's operand has v16i8 type
18150 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18151 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18152 return SDValue();
18153
18154 // Pattern is detected. Let's convert it to a sequence of nodes.
18155 SDLoc DL(N);
18156
18157 // First, create the node pattern of UABD/SABD.
18158 SDValue UABDHigh8Op0 =
18159 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18160 DAG.getConstant(8, DL, MVT::i64));
18161 SDValue UABDHigh8Op1 =
18162 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18163 DAG.getConstant(8, DL, MVT::i64));
18164 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18165 UABDHigh8Op0, UABDHigh8Op1);
18166 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18167
18168 // Second, create the node pattern of UABAL.
18169 SDValue UABDLo8Op0 =
18170 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18171 DAG.getConstant(0, DL, MVT::i64));
18172 SDValue UABDLo8Op1 =
18173 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18174 DAG.getConstant(0, DL, MVT::i64));
18175 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18176 UABDLo8Op0, UABDLo8Op1);
18177 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18178 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18179
18180 // Third, create the node of UADDLP.
18181 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18182
18183 // Fourth, create the node of VECREDUCE_ADD.
18184 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18185}
18186
18187// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
18188// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18189// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18190// If we have vectors larger than v16i8 we extract v16i8 vectors,
18191// Follow the same steps above to get DOT instructions concatenate them
18192// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
18193static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18194 const AArch64Subtarget *ST) {
18195 if (!ST->isNeonAvailable())
18196 return SDValue();
18197
18198 if (!ST->hasDotProd())
18199 return performVecReduceAddCombineWithUADDLP(N, DAG);
18200
18201 SDValue Op0 = N->getOperand(0);
18202 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18203 Op0.getValueType().getVectorElementType() != MVT::i32)
18204 return SDValue();
18205
18206 unsigned ExtOpcode = Op0.getOpcode();
18207 SDValue A = Op0;
18208 SDValue B;
18209 unsigned DotOpcode;
18210 if (ExtOpcode == ISD::MUL) {
18211 A = Op0.getOperand(0);
18212 B = Op0.getOperand(1);
18213 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18214 return SDValue();
18215 auto OpCodeA = A.getOpcode();
18216 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18217 return SDValue();
18218
18219 auto OpCodeB = B.getOpcode();
18220 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18221 return SDValue();
18222
18223 if (OpCodeA == OpCodeB) {
18224 DotOpcode =
18225 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18226 } else {
18227 // Check USDOT support
18228 if (!ST->hasMatMulInt8())
18229 return SDValue();
18230 DotOpcode = AArch64ISD::USDOT;
18231 if (OpCodeA == ISD::SIGN_EXTEND)
18232 std::swap(A, B);
18233 }
18234 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18235 DotOpcode = AArch64ISD::UDOT;
18236 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18237 DotOpcode = AArch64ISD::SDOT;
18238 } else {
18239 return SDValue();
18240 }
18241
18242 EVT Op0VT = A.getOperand(0).getValueType();
18243 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18244 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18245 if (!IsValidElementCount || !IsValidSize)
18246 return SDValue();
18247
18248 SDLoc DL(Op0);
18249 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18250 // the extend B.
18251 if (!B)
18252 B = DAG.getConstant(1, DL, Op0VT);
18253 else
18254 B = B.getOperand(0);
18255
18256 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18257 unsigned NumOfVecReduce;
18258 EVT TargetType;
18259 if (IsMultipleOf16) {
18260 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18261 TargetType = MVT::v4i32;
18262 } else {
18263 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18264 TargetType = MVT::v2i32;
18265 }
18266 // Handle the case where we need to generate only one Dot operation.
18267 if (NumOfVecReduce == 1) {
18268 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18269 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18270 A.getOperand(0), B);
18271 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18272 }
18273 // Generate Dot instructions that are multiple of 16.
18274 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18275 SmallVector<SDValue, 4> SDotVec16;
18276 unsigned I = 0;
18277 for (; I < VecReduce16Num; I += 1) {
18278 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18279 SDValue Op0 =
18280 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18281 DAG.getConstant(I * 16, DL, MVT::i64));
18282 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18283 DAG.getConstant(I * 16, DL, MVT::i64));
18284 SDValue Dot =
18285 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18286 SDotVec16.push_back(Dot);
18287 }
18288 // Concatenate dot operations.
18289 EVT SDot16EVT =
18290 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18291 SDValue ConcatSDot16 =
18292 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18293 SDValue VecReduceAdd16 =
18294 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18295 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18296 if (VecReduce8Num == 0)
18297 return VecReduceAdd16;
18298
18299 // Generate the remainder Dot operation that is multiple of 8.
18300 SmallVector<SDValue, 4> SDotVec8;
18301 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18302 SDValue Vec8Op0 =
18303 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18304 DAG.getConstant(I * 16, DL, MVT::i64));
18305 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18306 DAG.getConstant(I * 16, DL, MVT::i64));
18307 SDValue Dot =
18308 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18309 SDValue VecReduceAdd8 =
18310 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18311 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18312 VecReduceAdd8);
18313}
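// Illustrative sketch only (not part of the lowering above, and not an LLVM
// API; the helper name is hypothetical): a scalar model of the UDOT-based
// reduction. Each 32-bit lane of UDOT(acc, a, b) accumulates the products of
// one group of four byte lanes, so reducing the v4i32 result matches the
// original widened multiply-accumulate reduction. An all-ones B models the
// plain vecreduce.add(zext(A)) case handled above.
static bool dotReduceReferenceModel(const unsigned char A[16],
                                    const unsigned char B[16]) {
  // Widened reference: sum of A[i] * B[i] in 32 bits.
  unsigned Reference = 0;
  for (int I = 0; I < 16; ++I)
    Reference += (unsigned)A[I] * (unsigned)B[I];

  // UDOT model: four 32-bit accumulators, one per group of four byte lanes,
  // followed by a v4i32 vecreduce.add.
  unsigned Acc[4] = {0, 0, 0, 0};
  for (int Group = 0; Group < 4; ++Group)
    for (int Lane = 0; Lane < 4; ++Lane)
      Acc[Group] +=
          (unsigned)A[4 * Group + Lane] * (unsigned)B[4 * Group + Lane];

  return Reference == (Acc[0] + Acc[1] + Acc[2] + Acc[3]);
}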
18314
18315// Given an (integer) vecreduce, we know the order of the inputs does not
18316// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18317// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18318// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18319static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18320 auto DetectAddExtract = [&](SDValue A) {
18321 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18322 // UADDLP(x) if found.
18323 assert(A.getOpcode() == ISD::ADD);
18324 EVT VT = A.getValueType();
18325 SDValue Op0 = A.getOperand(0);
18326 SDValue Op1 = A.getOperand(1);
18327 if (Op0.getOpcode() != Op1.getOpcode() ||
18328 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18329 Op0.getOpcode() != ISD::SIGN_EXTEND))
18330 return SDValue();
18331 SDValue Ext0 = Op0.getOperand(0);
18332 SDValue Ext1 = Op1.getOperand(0);
18333 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18334 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18335 Ext0.getOperand(0) != Ext1.getOperand(0))
18336 return SDValue();
18337 // Check that the source type is twice the add type, and the extracts are
18338 // from the upper/lower halves of the same source.
18339 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
18340 VT.getVectorNumElements() * 2)
18341 return SDValue();
18342 if ((Ext0.getConstantOperandVal(1) != 0 ||
18343 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
18344 (Ext1.getConstantOperandVal(1) != 0 ||
18345 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
18346 return SDValue();
18347 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18348 : AArch64ISD::SADDLP;
18349 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
18350 };
18351
18352 if (SDValue R = DetectAddExtract(A))
18353 return R;
18354
18355 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
18356 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
18357 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18358 A.getOperand(1));
18359 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
18360 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
18361 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18362 A.getOperand(0));
18363 return SDValue();
18364}
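// Illustrative sketch only (not part of the combine above; the helper name is
// hypothetical): a scalar model showing why add(zext(extract_lo(x)),
// zext(extract_hi(x))) and UADDLP(x) feed UADDV the same total, modelled for
// an 8 x u16 input reduced to a 32-bit sum.
static bool uaddlpReferenceModel(const unsigned short X[8]) {
  // add(zext(extract_lo(x)), zext(extract_hi(x))): four widened lanes.
  unsigned LoHi[4];
  for (int I = 0; I < 4; ++I)
    LoHi[I] = (unsigned)X[I] + (unsigned)X[I + 4];

  // UADDLP(x): pairwise widening add of adjacent lanes, also four lanes.
  unsigned Pairwise[4];
  for (int I = 0; I < 4; ++I)
    Pairwise[I] = (unsigned)X[2 * I] + (unsigned)X[2 * I + 1];

  // UADDV only observes the total, which is identical for both groupings.
  return (LoHi[0] + LoHi[1] + LoHi[2] + LoHi[3]) ==
         (Pairwise[0] + Pairwise[1] + Pairwise[2] + Pairwise[3]);
}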
18365
18366// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18367// UADDLV(concat), where the concat represents the 64-bit zext sources.
18368static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18369 // Look for add(zext(64-bit source), zext(64-bit source)), returning
18370 // UADDLV(concat(zext, zext)) if found.
18371 assert(A.getOpcode() == ISD::ADD);
18372 EVT VT = A.getValueType();
18373 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18374 return SDValue();
18375 SDValue Op0 = A.getOperand(0);
18376 SDValue Op1 = A.getOperand(1);
18377 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18378 return SDValue();
18379 SDValue Ext0 = Op0.getOperand(0);
18380 SDValue Ext1 = Op1.getOperand(0);
18381 EVT ExtVT0 = Ext0.getValueType();
18382 EVT ExtVT1 = Ext1.getValueType();
18383 // Check zext VTs are the same and 64-bit length.
18384 if (ExtVT0 != ExtVT1 ||
18385 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18386 return SDValue();
18387 // Get VT for concat of zext sources.
18388 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
18389 SDValue Concat =
18390 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
18391
18392 switch (VT.getSimpleVT().SimpleTy) {
18393 case MVT::v2i64:
18394 case MVT::v4i32:
18395 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
18396 case MVT::v8i16: {
18397 SDValue Uaddlv =
18398 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
18399 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
18400 }
18401 default:
18402 llvm_unreachable("Unhandled vector type");
18403 }
18404}
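// Illustrative sketch only (not part of the combine above; the helper name is
// hypothetical): a scalar model of the UADDLV(concat) rewrite for two 4 x u16
// sources. Widening both sources, adding them lane-wise and reducing gives the
// same total as one widening horizontal add (UADDLV) over their concatenation.
static bool uaddlvConcatReferenceModel(const unsigned short A[4],
                                       const unsigned short B[4]) {
  // UADDV(add(zext(a), zext(b))).
  unsigned AddThenReduce = 0;
  for (int I = 0; I < 4; ++I)
    AddThenReduce += (unsigned)A[I] + (unsigned)B[I];

  // UADDLV(concat(a, b)).
  unsigned ReduceConcat = 0;
  for (int I = 0; I < 8; ++I)
    ReduceConcat += (unsigned)(I < 4 ? A[I] : B[I - 4]);

  return AddThenReduce == ReduceConcat;
}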
18405
18406static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18407 SDValue A = N->getOperand(0);
18408 if (A.getOpcode() == ISD::ADD) {
18409 if (SDValue R = performUADDVAddCombine(A, DAG))
18410 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
18411 else if (SDValue R = performUADDVZextCombine(A, DAG))
18412 return R;
18413 }
18414 return SDValue();
18415}
18416
18417static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18418 TargetLowering::DAGCombinerInfo &DCI,
18419 const AArch64Subtarget *Subtarget) {
18420 if (DCI.isBeforeLegalizeOps())
18421 return SDValue();
18422
18423 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18424}
18425
18426SDValue
18427AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18428 SelectionDAG &DAG,
18429 SmallVectorImpl<SDNode *> &Created) const {
18430 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18431 if (isIntDivCheap(N->getValueType(0), Attr))
18432 return SDValue(N, 0); // Lower SDIV as SDIV
18433
18434 EVT VT = N->getValueType(0);
18435
18436 // For scalable and fixed types, mark them as cheap so we can handle it much
18437 // later. This allows us to handle larger than legal types.
18438 if (VT.isScalableVector() ||
18439 (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
18440 return SDValue(N, 0);
18441
18442 // fold (sdiv X, pow2)
18443 if ((VT != MVT::i32 && VT != MVT::i64) ||
18444 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18445 return SDValue();
18446
18447 // If the divisor is 2 or -2, the default expansion is better. It will add
18448 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
18449 if (Divisor == 2 ||
18450 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18451 return SDValue();
18452
18453 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18454}
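// Illustrative sketch only (not part of BuildSDIVPow2 above; the helper name
// is hypothetical): a scalar model of the CMov-style expansion requested via
// buildSDIVPow2WithCMov. Negative dividends are biased by 2^N - 1 before the
// arithmetic shift so the quotient rounds toward zero, as sdiv requires.
static long long sdivPow2ReferenceModel(long long X, unsigned Lg2) {
  long long Bias = (1LL << Lg2) - 1;
  long long Adjusted = X < 0 ? X + Bias : X; // conditional select on the sign
  return Adjusted >> Lg2;                    // arithmetic shift right
}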
18455
18456SDValue
18457AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18458 SelectionDAG &DAG,
18459 SmallVectorImpl<SDNode *> &Created) const {
18460 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18461 if (isIntDivCheap(N->getValueType(0), Attr))
18462 return SDValue(N, 0); // Lower SREM as SREM
18463
18464 EVT VT = N->getValueType(0);
18465
18466 // For scalable and fixed types, mark them as cheap so we can handle it much
18467 // later. This allows us to handle larger than legal types.
18468 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18469 return SDValue(N, 0);
18470
18471 // fold (srem X, pow2)
18472 if ((VT != MVT::i32 && VT != MVT::i64) ||
18473 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18474 return SDValue();
18475
18476 unsigned Lg2 = Divisor.countr_zero();
18477 if (Lg2 == 0)
18478 return SDValue();
18479
18480 SDLoc DL(N);
18481 SDValue N0 = N->getOperand(0);
18482 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
18483 SDValue Zero = DAG.getConstant(0, DL, VT);
18484 SDValue CCVal, CSNeg;
18485 if (Lg2 == 1) {
18486 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
18487 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18488 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
18489
18490 Created.push_back(Cmp.getNode());
18491 Created.push_back(And.getNode());
18492 } else {
18493 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
18494 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18495
18496 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
18497 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18498 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
18499 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
18500 Negs.getValue(1));
18501
18502 Created.push_back(Negs.getNode());
18503 Created.push_back(AndPos.getNode());
18504 Created.push_back(AndNeg.getNode());
18505 }
18506
18507 return CSNeg;
18508}
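// Illustrative sketch only (not part of BuildSREMPow2 above; the helper name
// is hypothetical): a scalar model of the CSNEG expansion for srem by 2^Lg2
// with Lg2 > 1, ignoring the INT_MIN edge case. SUBS(0, X) sets the MI
// (negative) flag exactly when X is positive, so CSNEG picks X & (2^Lg2 - 1)
// directly for positive X and the negated remainder of -X otherwise.
static long long sremPow2ReferenceModel(long long X, unsigned Lg2) {
  long long Mask = (1LL << Lg2) - 1;
  long long AndPos = X & Mask;    // remainder when X >= 0
  long long AndNeg = (-X) & Mask; // remainder of -X, used when X <= 0
  // CSNEG(AndPos, AndNeg, MI, flags(0 - X)).
  return X > 0 ? AndPos : -AndNeg;
}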
18509
18510static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18511 switch(getIntrinsicID(S.getNode())) {
18512 default:
18513 break;
18514 case Intrinsic::aarch64_sve_cntb:
18515 return 8;
18516 case Intrinsic::aarch64_sve_cnth:
18517 return 16;
18518 case Intrinsic::aarch64_sve_cntw:
18519 return 32;
18520 case Intrinsic::aarch64_sve_cntd:
18521 return 64;
18522 }
18523 return {};
18524}
18525
18526/// Calculates what the pre-extend type is, based on the extension
18527/// operation node provided by \p Extend.
18528///
18529/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18530/// pre-extend type is pulled directly from the operand, while other extend
18531/// operations need a bit more inspection to get this information.
18532///
18533/// \param Extend The SDNode from the DAG that represents the extend operation
18534///
18535/// \returns The type representing the \p Extend source type, or \p MVT::Other
18536/// if no valid type can be determined
18537static EVT calculatePreExtendType(SDValue Extend) {
18538 switch (Extend.getOpcode()) {
18539 case ISD::SIGN_EXTEND:
18540 case ISD::ZERO_EXTEND:
18541 case ISD::ANY_EXTEND:
18542 return Extend.getOperand(0).getValueType();
18543 case ISD::AssertSext:
18544 case ISD::AssertZext:
18545 case ISD::SIGN_EXTEND_INREG: {
18546 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
18547 if (!TypeNode)
18548 return MVT::Other;
18549 return TypeNode->getVT();
18550 }
18551 case ISD::AND: {
18552 ConstantSDNode *Constant =
18553 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
18554 if (!Constant)
18555 return MVT::Other;
18556
18557 uint32_t Mask = Constant->getZExtValue();
18558
18559 if (Mask == UCHAR_MAX)
18560 return MVT::i8;
18561 else if (Mask == USHRT_MAX)
18562 return MVT::i16;
18563 else if (Mask == UINT_MAX)
18564 return MVT::i32;
18565
18566 return MVT::Other;
18567 }
18568 default:
18569 return MVT::Other;
18570 }
18571}
18572
18573/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18574/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18575/// SExt/ZExt rather than the scalar SExt/ZExt
18576static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18577 EVT VT = BV.getValueType();
18578 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18579 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18580 return SDValue();
18581
18582 // Use the first item in the buildvector/shuffle to get the size of the
18583 // extend, and make sure it looks valid.
18584 SDValue Extend = BV->getOperand(0);
18585 unsigned ExtendOpcode = Extend.getOpcode();
18586 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18587 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18588 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18589 ExtendOpcode == ISD::AssertSext;
18590 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18591 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18592 return SDValue();
18593 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
18594 // ensure calculatePreExtendType will work without issue.
18595 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18596 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18597 return SDValue();
18598
18599 // Restrict valid pre-extend data type
18600 EVT PreExtendType = calculatePreExtendType(Extend);
18601 if (PreExtendType == MVT::Other ||
18602 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18603 return SDValue();
18604
18605 // Make sure all other operands are equally extended.
18606 bool SeenZExtOrSExt = !IsAnyExt;
18607 for (SDValue Op : drop_begin(BV->ops())) {
18608 if (Op.isUndef())
18609 continue;
18610
18611 if (calculatePreExtendType(Op) != PreExtendType)
18612 return SDValue();
18613
18614 unsigned Opc = Op.getOpcode();
18615 if (Opc == ISD::ANY_EXTEND)
18616 continue;
18617
18618 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18619 Opc == ISD::AssertSext;
18620
18621 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
18622 return SDValue();
18623
18624 IsSExt = OpcIsSExt;
18625 SeenZExtOrSExt = true;
18626 }
18627
18628 SDValue NBV;
18629 SDLoc DL(BV);
18630 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18631 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
18632 EVT PreExtendLegalType =
18633 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18634 SmallVector<SDValue, 8> NewOps;
18635 for (SDValue Op : BV->ops())
18636 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
18637 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
18638 PreExtendLegalType));
18639 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
18640 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18641 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
18642 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
18643 BV.getOperand(1).isUndef()
18644 ? DAG.getUNDEF(PreExtendVT)
18645 : BV.getOperand(1).getOperand(0),
18646 cast<ShuffleVectorSDNode>(BV)->getMask());
18647 }
18648 unsigned ExtOpc = !SeenZExtOrSExt
18649 ? ISD::ANY_EXTEND
18650 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
18651 return DAG.getNode(ExtOpc, DL, VT, NBV);
18652}
18653
18654/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18655/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18656static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18657 // If the value type isn't a vector, none of the operands are going to be dups
18658 EVT VT = Mul->getValueType(0);
18659 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18660 return SDValue();
18661
18662 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
18663 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
18664
18665 // Neither operand has been changed, so don't make any further changes.
18666 if (!Op0 && !Op1)
18667 return SDValue();
18668
18669 SDLoc DL(Mul);
18670 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
18671 Op1 ? Op1 : Mul->getOperand(1));
18672}
18673
18674// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18675// Same for other types with equivalent constants.
18676static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18677 EVT VT = N->getValueType(0);
18678 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18679 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18680 return SDValue();
18681 if (N->getOperand(0).getOpcode() != ISD::AND ||
18682 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
18683 return SDValue();
18684
18685 SDValue And = N->getOperand(0);
18686 SDValue Srl = And.getOperand(0);
18687
18688 APInt V1, V2, V3;
18689 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
18690 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
18691 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
18692 return SDValue();
18693
18694 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18695 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18696 V3 != (HalfSize - 1))
18697 return SDValue();
18698
18699 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18700 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
18701 VT.getVectorElementCount() * 2);
18702
18703 SDLoc DL(N);
18704 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
18705 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
18706 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
18707}
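// Illustrative sketch only (not part of the combine above; the helper name is
// hypothetical): per 16-bit lane, the matched pattern
// ((x >> 15) & 1) * 0xffff yields 0xffff when the sign bit is set and 0
// otherwise, which is exactly the lane mask CMLTz (compare signed less than
// zero) produces on the half-width type.
static bool cmltzReferenceModel(unsigned short X) {
  unsigned short MulForm =
      (unsigned short)(((X >> 15) & 1u) * 0xffffu); // Srl + And + Mul pattern
  unsigned short CmltzForm =
      (short)X < 0 ? (unsigned short)0xffffu : (unsigned short)0; // CMLTz
  return MulForm == CmltzForm;
}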
18708
18709// Transform vector add(zext i8 to i32, zext i8 to i32)
18710// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18711 // This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
18712 // extends.
18713static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18714 EVT VT = N->getValueType(0);
18715 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18716 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
18717 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
18718 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
18719 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
18720 N->getOperand(0).getOperand(0).getValueType() !=
18721 N->getOperand(1).getOperand(0).getValueType())
18722 return SDValue();
18723
18724 if (N->getOpcode() == ISD::MUL &&
18725 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
18726 return SDValue();
18727
18728 SDValue N0 = N->getOperand(0).getOperand(0);
18729 SDValue N1 = N->getOperand(1).getOperand(0);
18730 EVT InVT = N0.getValueType();
18731
18732 EVT S1 = InVT.getScalarType();
18733 EVT S2 = VT.getScalarType();
18734 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18735 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18736 SDLoc DL(N);
18737 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18738 EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() / 2),
18739 VT.getVectorElementCount());
18740 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
18741 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
18742 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
18743 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
18744 : (unsigned)ISD::SIGN_EXTEND,
18745 DL, VT, NewOp);
18746 }
18747 return SDValue();
18748}
18749
18750static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18751 TargetLowering::DAGCombinerInfo &DCI,
18752 const AArch64Subtarget *Subtarget) {
18753
18754 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
18755 return Ext;
18756 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18757 return Ext;
18758 if (SDValue Ext = performVectorExtCombine(N, DAG))
18759 return Ext;
18760
18761 if (DCI.isBeforeLegalizeOps())
18762 return SDValue();
18763
18764 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18765 // and in MachineCombiner pass, add+mul will be combined into madd.
18766 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18767 SDLoc DL(N);
18768 EVT VT = N->getValueType(0);
18769 SDValue N0 = N->getOperand(0);
18770 SDValue N1 = N->getOperand(1);
18771 SDValue MulOper;
18772 unsigned AddSubOpc;
18773
18774 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18775 AddSubOpc = V->getOpcode();
18776 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18777 SDValue Opnd = V->getOperand(1);
18778 MulOper = V->getOperand(0);
18779 if (AddSubOpc == ISD::SUB)
18780 std::swap(Opnd, MulOper);
18781 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18782 return C->isOne();
18783 }
18784 return false;
18785 };
18786
18787 if (IsAddSubWith1(N0)) {
18788 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18789 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18790 }
18791
18792 if (IsAddSubWith1(N1)) {
18793 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18794 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18795 }
18796
18797 // The below optimizations require a constant RHS.
18798 if (!isa<ConstantSDNode>(N1))
18799 return SDValue();
18800
18801 ConstantSDNode *C = cast<ConstantSDNode>(N1);
18802 const APInt &ConstValue = C->getAPIntValue();
18803
18804 // Allow the scaling to be folded into the `cnt` instruction by preventing
18805 // the scaling from being obscured here. This makes it easier to pattern match.
18806 if (IsSVECntIntrinsic(N0) ||
18807 (N0->getOpcode() == ISD::TRUNCATE &&
18808 (IsSVECntIntrinsic(N0->getOperand(0)))))
18809 if (ConstValue.sge(1) && ConstValue.sle(16))
18810 return SDValue();
18811
18812 // Multiplication of a power of two plus/minus one can be done more
18813 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18814 // future CPUs have a cheaper MADD instruction, this may need to be
18815 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18816 // 64-bit is 5 cycles, so this is always a win.
18817 // More aggressively, some multiplications N0 * C can be lowered to
18818 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18819 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18820 // TODO: lower more cases.
18821
18822 // TrailingZeroes is used to test if the mul can be lowered to
18823 // shift+add+shift.
18824 unsigned TrailingZeroes = ConstValue.countr_zero();
18825 if (TrailingZeroes) {
18826 // Conservatively do not lower to shift+add+shift if the mul might be
18827 // folded into smul or umul.
18828 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18829 isZeroExtended(N0, DAG)))
18830 return SDValue();
18831 // Conservatively do not lower to shift+add+shift if the mul might be
18832 // folded into madd or msub.
18833 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
18834 N->user_begin()->getOpcode() == ISD::SUB))
18835 return SDValue();
18836 }
18837 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18838 // and shift+add+shift.
18839 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18840 unsigned ShiftAmt;
18841
18842 auto Shl = [&](SDValue N0, unsigned N1) {
18843 if (!N0.getNode())
18844 return SDValue();
18845 // If shift causes overflow, ignore this combine.
18846 if (N1 >= N0.getValueSizeInBits())
18847 return SDValue();
18848 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18849 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18850 };
18851 auto Add = [&](SDValue N0, SDValue N1) {
18852 if (!N0.getNode() || !N1.getNode())
18853 return SDValue();
18854 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
18855 };
18856 auto Sub = [&](SDValue N0, SDValue N1) {
18857 if (!N0.getNode() || !N1.getNode())
18858 return SDValue();
18859 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
18860 };
18861 auto Negate = [&](SDValue N) {
18862 if (!N0.getNode())
18863 return SDValue();
18864 SDValue Zero = DAG.getConstant(0, DL, VT);
18865 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
18866 };
18867
18868 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
18869 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
18870 // the (2^N - 1) can't be executed via a single instruction.
18871 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18872 unsigned BitWidth = C.getBitWidth();
18873 for (unsigned i = 1; i < BitWidth / 2; i++) {
18874 APInt Rem;
18875 APInt X(BitWidth, (1 << i) + 1);
18876 APInt::sdivrem(C, X, N, Rem);
18877 APInt NVMinus1 = N - 1;
18878 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18879 M = X;
18880 return true;
18881 }
18882 }
18883 return false;
18884 };
18885
18886 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
18887 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
18888 // the (2^N - 1) can't be executed via a single instruction.
18889 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18890 APInt CVMinus1 = C - 1;
18891 if (CVMinus1.isNegative())
18892 return false;
18893 unsigned TrailingZeroes = CVMinus1.countr_zero();
18894 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
18895 if (SCVMinus1.isPowerOf2()) {
18896 unsigned BitWidth = SCVMinus1.getBitWidth();
18897 M = APInt(BitWidth, SCVMinus1.logBase2());
18898 N = APInt(BitWidth, TrailingZeroes);
18899 return true;
18900 }
18901 return false;
18902 };
18903
18904 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18905 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18906 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18907 APInt CVMinus1 = C - 1;
18908 if (CVMinus1.isNegative())
18909 return false;
18910 unsigned TrailingZeroes = CVMinus1.countr_zero();
18911 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
18912 if (CVPlus1.isPowerOf2()) {
18913 unsigned BitWidth = CVPlus1.getBitWidth();
18914 M = APInt(BitWidth, CVPlus1.logBase2());
18915 N = APInt(BitWidth, TrailingZeroes);
18916 return true;
18917 }
18918 return false;
18919 };
18920
18921 if (ConstValue.isNonNegative()) {
18922 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
18923 // (mul x, 2^N - 1) => (sub (shl x, N), x)
18924 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
18925 // (mul x, (2^M + 1) * (2^N + 1))
18926 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
18927 // (mul x, (2^M + 1) * 2^N + 1))
18928 // => MV = add (shl x, M), x); add (shl MV, N), x)
18929 // (mul x, 1 - (1 - 2^M) * 2^N))
18930 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
18931 APInt SCVMinus1 = ShiftedConstValue - 1;
18932 APInt SCVPlus1 = ShiftedConstValue + 1;
18933 APInt CVPlus1 = ConstValue + 1;
18934 APInt CVM, CVN;
18935 if (SCVMinus1.isPowerOf2()) {
18936 ShiftAmt = SCVMinus1.logBase2();
18937 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18938 } else if (CVPlus1.isPowerOf2()) {
18939 ShiftAmt = CVPlus1.logBase2();
18940 return Sub(Shl(N0, ShiftAmt), N0);
18941 } else if (SCVPlus1.isPowerOf2()) {
18942 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18943 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18944 }
18945 if (Subtarget->hasALULSLFast() &&
18946 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18947 APInt CVMMinus1 = CVM - 1;
18948 APInt CVNMinus1 = CVN - 1;
18949 unsigned ShiftM1 = CVMMinus1.logBase2();
18950 unsigned ShiftN1 = CVNMinus1.logBase2();
18951 // ALULSLFast implies that shifts of up to 4 places are fast
18952 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18953 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18954 return Add(Shl(MVal, ShiftN1), MVal);
18955 }
18956 }
18957 if (Subtarget->hasALULSLFast() &&
18958 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18959 unsigned ShiftM = CVM.getZExtValue();
18960 unsigned ShiftN = CVN.getZExtValue();
18961 // ALULSLFast implies that shifts of up to 4 places are fast
18962 if (ShiftM <= 4 && ShiftN <= 4) {
18963 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
18964 return Add(Shl(MVal, CVN.getZExtValue()), N0);
18965 }
18966 }
18967
18968 if (Subtarget->hasALULSLFast() &&
18969 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18970 unsigned ShiftM = CVM.getZExtValue();
18971 unsigned ShiftN = CVN.getZExtValue();
18972 // ALULSLFast implies that shifts of up to 4 places are fast
18973 if (ShiftM <= 4 && ShiftN <= 4) {
18974 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
18975 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
18976 }
18977 }
18978 } else {
18979 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18980 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18981 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18982 APInt SCVPlus1 = -ShiftedConstValue + 1;
18983 APInt CVNegPlus1 = -ConstValue + 1;
18984 APInt CVNegMinus1 = -ConstValue - 1;
18985 if (CVNegPlus1.isPowerOf2()) {
18986 ShiftAmt = CVNegPlus1.logBase2();
18987 return Sub(N0, Shl(N0, ShiftAmt));
18988 } else if (CVNegMinus1.isPowerOf2()) {
18989 ShiftAmt = CVNegMinus1.logBase2();
18990 return Negate(Add(Shl(N0, ShiftAmt), N0));
18991 } else if (SCVPlus1.isPowerOf2()) {
18992 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18993 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
18994 }
18995 }
18996
18997 return SDValue();
18998}
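// Illustrative sketch only (not part of performMulCombine above; the helper
// name is hypothetical): worked examples of the shift/add decompositions used
// for constant multiplies, checked with unsigned wrap-around arithmetic to
// match the modular semantics of the DAG nodes.
static bool mulDecompositionReferenceModel(unsigned long long X) {
  // 45 = (1 + 4) * (1 + 8): MV = (x << 2) + x; result = (MV << 3) + MV.
  unsigned long long MV45 = (X << 2) + X;
  bool Ok45 = ((MV45 << 3) + MV45) == X * 45;

  // 40 = (1 + 4) * 8: (shl (add (shl x, 2), x), 3).
  bool Ok40 = (((X << 2) + X) << 3) == X * 40;

  // 7 = 2^3 - 1: (sub (shl x, 3), x).
  bool Ok7 = ((X << 3) - X) == X * 7;

  // 11 = (1 + 4) * 2 + 1: MV = (x << 2) + x; result = (MV << 1) + x.
  unsigned long long MV11 = (X << 2) + X;
  bool Ok11 = ((MV11 << 1) + X) == X * 11;

  // 29 = 1 - (1 - 2^3) * 2^2: MV = x - (x << 3); result = x - (MV << 2).
  unsigned long long MV29 = X - (X << 3);
  bool Ok29 = (X - (MV29 << 2)) == X * 29;

  return Ok45 && Ok40 && Ok7 && Ok11 && Ok29;
}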
18999
19000static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
19001 SelectionDAG &DAG) {
19002 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19003 // optimize away operation when it's from a constant.
19004 //
19005 // The general transformation is:
19006 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19007 // AND(VECTOR_CMP(x,y), constant2)
19008 // constant2 = UNARYOP(constant)
19009
19010 // Early exit if this isn't a vector operation, the operand of the
19011 // unary operation isn't a bitwise AND, or if the sizes of the operations
19012 // aren't the same.
19013 EVT VT = N->getValueType(0);
19014 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19015 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19016 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19017 return SDValue();
19018
19019 // Now check that the other operand of the AND is a constant. We could
19020 // make the transformation for non-constant splats as well, but it's unclear
19021 // that would be a benefit as it would not eliminate any operations, just
19022 // perform one more step in scalar code before moving to the vector unit.
19023 if (BuildVectorSDNode *BV =
19024 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19025 // Bail out if the vector isn't a constant.
19026 if (!BV->isConstant())
19027 return SDValue();
19028
19029 // Everything checks out. Build up the new and improved node.
19030 SDLoc DL(N);
19031 EVT IntVT = BV->getValueType(0);
19032 // Create a new constant of the appropriate type for the transformed
19033 // DAG.
19034 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19035 // The AND node needs bitcasts to/from an integer vector type around it.
19036 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19037 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19038 N->getOperand(0)->getOperand(0), MaskConst);
19039 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19040 return Res;
19041 }
19042
19043 return SDValue();
19044}
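// Illustrative sketch only (not part of the combine above; the helper name is
// hypothetical): a one-lane scalar model of folding
// UNARYOP(AND(VECTOR_CMP(x, y), constant)) into
// AND(VECTOR_CMP(x, y), UNARYOP(constant)) for sint_to_fp. A compare lane is
// all-zeros or all-ones, so ANDing with it selects either zero or the whole
// constant; because 0.0f has an all-zero bit pattern, converting the constant
// up front and masking the result gives the same lane value.
static bool cmpMaskUnaryOpReferenceModel(int X, int Y, int Constant) {
  // Original order: mask the integer constant, then convert.
  int Masked = (X < Y) ? Constant : 0; // AND with the all-ones/zeros lane
  float Original = (float)Masked;

  // Transformed order: convert the constant once, then apply the lane mask.
  float Transformed = (X < Y) ? (float)Constant : 0.0f;

  return Original == Transformed;
}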
19045
19046/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19047 /// functions; this can help reduce the number of fmovs to/from GPRs.
19048static SDValue
19049tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19050 TargetLowering::DAGCombinerInfo &DCI,
19051 const AArch64Subtarget *Subtarget) {
19052 if (N->isStrictFPOpcode())
19053 return SDValue();
19054
19055 if (DCI.isBeforeLegalizeOps())
19056 return SDValue();
19057
19058 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19059 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19060 return SDValue();
19061
19062 auto isSupportedType = [](EVT VT) {
19063 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19064 };
19065
19066 SDValue SrcVal = N->getOperand(0);
19067 EVT SrcTy = SrcVal.getValueType();
19068 EVT DestTy = N->getValueType(0);
19069
19070 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19071 return SDValue();
19072
19073 EVT SrcVecTy;
19074 EVT DestVecTy;
19075 if (DestTy.bitsGT(SrcTy)) {
19076 DestVecTy = getPackedSVEVectorVT(DestTy);
19077 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19078 } else {
19079 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19080 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19081 }
19082
19083 // Ensure the resulting src/dest vector type is legal.
19084 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19085 return SDValue();
19086
19087 SDLoc DL(N);
19088 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19089 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19090 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19091 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19092 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19093}
19094
19095static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19096 TargetLowering::DAGCombinerInfo &DCI,
19097 const AArch64Subtarget *Subtarget) {
19098 // First try to optimize away the conversion when it's conditionally from
19099 // a constant. Vectors only.
19100 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19101 return Res;
19102
19103 if (SDValue Res =
19104 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19105 return Res;
19106
19107 EVT VT = N->getValueType(0);
19108 if (VT != MVT::f32 && VT != MVT::f64)
19109 return SDValue();
19110
19111 // Only optimize when the source and destination types have the same width.
19112 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19113 return SDValue();
19114
19115 // If the result of an integer load is only used by an integer-to-float
19116 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
19117 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19118 SDValue N0 = N->getOperand(0);
19119 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19120 N0.hasOneUse() &&
19121 // Do not change the width of a volatile load.
19122 !cast<LoadSDNode>(N0)->isVolatile()) {
19123 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19124 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19125 LN0->getPointerInfo(), LN0->getAlign(),
19126 LN0->getMemOperand()->getFlags());
19127
19128 // Make sure successors of the original load stay after it by updating them
19129 // to use the new Chain.
19130 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19131
19132 unsigned Opcode =
19133 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19134 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19135 }
19136
19137 return SDValue();
19138}
19139
19140/// Fold a floating-point multiply by power of two into floating-point to
19141/// fixed-point conversion.
19142static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
19143 TargetLowering::DAGCombinerInfo &DCI,
19144 const AArch64Subtarget *Subtarget) {
19145 if (SDValue Res =
19146 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19147 return Res;
19148
19149 if (!Subtarget->isNeonAvailable())
19150 return SDValue();
19151
19152 if (!N->getValueType(0).isSimple())
19153 return SDValue();
19154
19155 SDValue Op = N->getOperand(0);
19156 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19157 return SDValue();
19158
19159 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19160 return SDValue();
19161
19162 SDValue ConstVec = Op->getOperand(1);
19163 if (!isa<BuildVectorSDNode>(ConstVec))
19164 return SDValue();
19165
19166 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19167 uint32_t FloatBits = FloatTy.getSizeInBits();
19168 if (FloatBits != 32 && FloatBits != 64 &&
19169 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19170 return SDValue();
19171
19172 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19173 uint32_t IntBits = IntTy.getSizeInBits();
19174 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19175 return SDValue();
19176
19177 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19178 if (IntBits > FloatBits)
19179 return SDValue();
19180
19181 BitVector UndefElements;
19182 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19183 int32_t Bits = IntBits == 64 ? 64 : 32;
19184 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19185 if (C == -1 || C == 0 || C > Bits)
19186 return SDValue();
19187
19188 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19189 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19190 return SDValue();
19191
19192 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19193 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19194 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19195 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19196 return SDValue();
19197 }
19198
19199 SDLoc DL(N);
19200 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19201 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19202 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19203 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19204 SDValue FixConv =
19205 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
19206 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19207 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19208 // We can handle smaller integers by generating an extra trunc.
19209 if (IntBits < FloatBits)
19210 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19211
19212 return FixConv;
19213}
19214
19215static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19216 const AArch64TargetLowering &TLI) {
19217 EVT VT = N->getValueType(0);
19218 SelectionDAG &DAG = DCI.DAG;
19219 SDLoc DL(N);
19220 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19221
19222 if (!VT.isVector())
19223 return SDValue();
19224
19225 if (VT.isScalableVector() && !Subtarget.hasSVE2())
19226 return SDValue();
19227
19228 if (VT.isFixedLengthVector() &&
19229 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
19230 return SDValue();
19231
19232 SDValue N0 = N->getOperand(0);
19233 if (N0.getOpcode() != ISD::AND)
19234 return SDValue();
19235
19236 SDValue N1 = N->getOperand(1);
19237 if (N1.getOpcode() != ISD::AND)
19238 return SDValue();
19239
19240 // InstCombine does (not (neg a)) => (add a -1).
19241 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
19242 // Loop over all combinations of AND operands.
19243 for (int i = 1; i >= 0; --i) {
19244 for (int j = 1; j >= 0; --j) {
19245 SDValue O0 = N0->getOperand(i);
19246 SDValue O1 = N1->getOperand(j);
19247 SDValue Sub, Add, SubSibling, AddSibling;
19248
19249 // Find a SUB and an ADD operand, one from each AND.
19250 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
19251 Sub = O0;
19252 Add = O1;
19253 SubSibling = N0->getOperand(1 - i);
19254 AddSibling = N1->getOperand(1 - j);
19255 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
19256 Add = O0;
19257 Sub = O1;
19258 AddSibling = N0->getOperand(1 - i);
19259 SubSibling = N1->getOperand(1 - j);
19260 } else
19261 continue;
19262
19263 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
19264 continue;
19265
19266 // The all-ones constant is always the right-hand operand of the Add.
19267 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
19268 continue;
19269
19270 if (Sub.getOperand(1) != Add.getOperand(0))
19271 continue;
19272
19273 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
19274 }
19275 }
19276
19277 // (or (and a b) (and (not a) c)) => (bsl a b c)
19278 // We only have to look for constant vectors here since the general, variable
19279 // case can be handled in TableGen.
19280 unsigned Bits = VT.getScalarSizeInBits();
19281 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
19282 for (int i = 1; i >= 0; --i)
19283 for (int j = 1; j >= 0; --j) {
19284 APInt Val1, Val2;
19285
19286 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
19287 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
19288 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
19289 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19290 N0->getOperand(1 - i), N1->getOperand(1 - j));
19291 }
19292 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
19293 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
19294 if (!BVN0 || !BVN1)
19295 continue;
19296
19297 bool FoundMatch = true;
19298 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
19299 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
19300 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
19301 if (!CN0 || !CN1 ||
19302 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
19303 FoundMatch = false;
19304 break;
19305 }
19306 }
19307 if (FoundMatch)
19308 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19309 N0->getOperand(1 - i), N1->getOperand(1 - j));
19310 }
19311
19312 return SDValue();
19313}
19314
19315// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19316// convert to csel(ccmp(.., cc0)), depending on cc1:
19317
19318// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19319// =>
19320// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19321//
19322// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19323// =>
19324// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19325static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
19326 EVT VT = N->getValueType(0);
19327 SDValue CSel0 = N->getOperand(0);
19328 SDValue CSel1 = N->getOperand(1);
19329
19330 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19331 CSel1.getOpcode() != AArch64ISD::CSEL)
19332 return SDValue();
19333
19334 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19335 return SDValue();
19336
19337 if (!isNullConstant(CSel0.getOperand(0)) ||
19338 !isOneConstant(CSel0.getOperand(1)) ||
19339 !isNullConstant(CSel1.getOperand(0)) ||
19340 !isOneConstant(CSel1.getOperand(1)))
19341 return SDValue();
19342
19343 SDValue Cmp0 = CSel0.getOperand(3);
19344 SDValue Cmp1 = CSel1.getOperand(3);
19345 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
19346 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
19347 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19348 return SDValue();
19349 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19350 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19351 std::swap(Cmp0, Cmp1);
19352 std::swap(CC0, CC1);
19353 }
19354
19355 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19356 return SDValue();
19357
19358 SDLoc DL(N);
19359 SDValue CCmp, Condition;
19360 unsigned NZCV;
19361
19362 if (N->getOpcode() == ISD::AND) {
19363 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
19364 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
19365 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
19366 } else {
19367 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
19368 Condition = DAG.getConstant(CC0, DL, MVT_CC);
19369 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
19370 }
19371
19372 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19373
19374 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19375 if (Op1 && Op1->getAPIntValue().isNegative() &&
19376 Op1->getAPIntValue().sgt(-32)) {
19377 // CCMP accepts constants in the range [0, 31], so
19378 // if Op1 is a constant in the range [-31, -1], we
19379 // can select CCMN to avoid the extra mov.
19380 SDValue AbsOp1 =
19381 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19382 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
19383 NZCVOp, Condition, Cmp0);
19384 } else {
19385 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
19386 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19387 }
19388 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19389 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
19390 CCmp);
19391}
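// Illustrative sketch only (not part of the combine above; the helper name is
// hypothetical): a source-level pattern that typically lowers to
// and(CSET(cmp0), CSET(cmp1)), which the combine above folds into a single
// conditional-compare chain (CMP + CCMP/CCMN + CSET) instead of materialising
// two boolean registers.
static bool ccmpCandidatePattern(int A, int B, int C, int D) {
  return (A == B) && (C < D);
}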
19392
19393static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19394 const AArch64Subtarget *Subtarget,
19395 const AArch64TargetLowering &TLI) {
19396 SelectionDAG &DAG = DCI.DAG;
19397 EVT VT = N->getValueType(0);
19398
19399 if (SDValue R = performANDORCSELCombine(N, DAG))
19400 return R;
19401
19402 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19403 return SDValue();
19404
19405 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
19406 return Res;
19407
19408 return SDValue();
19409}
19410
19411static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
19412 if (!MemVT.getVectorElementType().isSimple())
19413 return false;
19414
19415 uint64_t MaskForTy = 0ull;
19416 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19417 case MVT::i8:
19418 MaskForTy = 0xffull;
19419 break;
19420 case MVT::i16:
19421 MaskForTy = 0xffffull;
19422 break;
19423 case MVT::i32:
19424 MaskForTy = 0xffffffffull;
19425 break;
19426 default:
19427 return false;
19428 break;
19429 }
19430
19431 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19432 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
19433 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19434
19435 return false;
19436}
19437
19438static SDValue performReinterpretCastCombine(SDNode *N) {
19439 SDValue LeafOp = SDValue(N, 0);
19440 SDValue Op = N->getOperand(0);
19441 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19442 LeafOp.getValueType() != Op.getValueType())
19443 Op = Op->getOperand(0);
19444 if (LeafOp.getValueType() == Op.getValueType())
19445 return Op;
19446 return SDValue();
19447}
19448
19449static SDValue performSVEAndCombine(SDNode *N,
19450 TargetLowering::DAGCombinerInfo &DCI) {
19451 SelectionDAG &DAG = DCI.DAG;
19452 SDValue Src = N->getOperand(0);
19453 unsigned Opc = Src->getOpcode();
19454
19455 // Zero/any extend of an unsigned unpack
19456 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19457 SDValue UnpkOp = Src->getOperand(0);
19458 SDValue Dup = N->getOperand(1);
19459
19460 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19461 return SDValue();
19462
19463 SDLoc DL(N);
19464 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
19465 if (!C)
19466 return SDValue();
19467
19468 uint64_t ExtVal = C->getZExtValue();
19469
19470 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19471 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19472 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19473 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19474 };
19475
19476 // If the mask is fully covered by the unpack, we don't need to push
19477 // a new AND onto the operand
19478 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
19479 if (MaskAndTypeMatch(EltTy))
19480 return Src;
19481
19482 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19483 // to see if the mask is all-ones of size MemTy.
19484 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
19485 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19486 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19487 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19488 if (MaskAndTypeMatch(EltTy))
19489 return Src;
19490 }
19491
19492 // Truncate to prevent a DUP with an overly wide constant
19493 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
19494
19495 // Otherwise, make sure we propagate the AND to the operand
19496 // of the unpack
19497 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
19498 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
19499
19500 SDValue And = DAG.getNode(ISD::AND, DL,
19501 UnpkOp->getValueType(0), UnpkOp, Dup);
19502
19503 return DAG.getNode(Opc, DL, N->getValueType(0), And);
19504 }
19505
19506 if (DCI.isBeforeLegalizeOps())
19507 return SDValue();
19508
19509 // If both sides of AND operations are i1 splat_vectors then
19510 // we can produce just i1 splat_vector as the result.
19511 if (isAllActivePredicate(DAG, N->getOperand(0)))
19512 return N->getOperand(1);
19513 if (isAllActivePredicate(DAG, N->getOperand(1)))
19514 return N->getOperand(0);
19515
19516 if (!EnableCombineMGatherIntrinsics)
19517 return SDValue();
19518
19519 SDValue Mask = N->getOperand(1);
19520
19521 if (!Src.hasOneUse())
19522 return SDValue();
19523
19524 EVT MemVT;
19525
19526 // SVE load instructions perform an implicit zero-extend, which makes them
19527 // perfect candidates for combining.
19528 switch (Opc) {
19532 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
19533 break;
19549 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
19550 break;
19551 default:
19552 return SDValue();
19553 }
19554
19555 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
19556 return Src;
19557
19558 return SDValue();
19559}
19560
19561// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19562static SDValue performANDSETCCCombine(SDNode *N,
19563 TargetLowering::DAGCombinerInfo &DCI) {
19564
19565 // This function performs an optimization on a specific pattern involving
19566 // an AND operation and SETCC (Set Condition Code) node.
19567
19568 SDValue SetCC = N->getOperand(0);
19569 EVT VT = N->getValueType(0);
19570 SelectionDAG &DAG = DCI.DAG;
19571
19572 // If the current node (N) is used by any SELECT instruction, return an
19573 // empty SDValue so the optimization is not applied, since applying it
19574 // there could produce incorrect results.
19575 for (auto U : N->users())
19576 if (U->getOpcode() == ISD::SELECT)
19577 return SDValue();
19578
19579 // Check if the operand is a SETCC node with floating-point comparison
19580 if (SetCC.getOpcode() == ISD::SETCC &&
19581 SetCC.getOperand(0).getValueType() == MVT::f32) {
19582
19583 SDValue Cmp;
19584 AArch64CC::CondCode CC;
19585
19586 // Check if the DAG is after legalization and if we can emit the conjunction
19587 if (!DCI.isBeforeLegalize() &&
19588 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19589
19590 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
19591
19592 SDLoc DL(N);
19593 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
19594 DAG.getConstant(0, DL, VT),
19595 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
19596 }
19597 }
19598 return SDValue();
19599}
19600
19601static SDValue performANDCombine(SDNode *N,
19602 TargetLowering::DAGCombinerInfo &DCI) {
19603 SelectionDAG &DAG = DCI.DAG;
19604 SDValue LHS = N->getOperand(0);
19605 SDValue RHS = N->getOperand(1);
19606 EVT VT = N->getValueType(0);
19607
19608 if (SDValue R = performANDORCSELCombine(N, DAG))
19609 return R;
19610
19611 if (SDValue R = performANDSETCCCombine(N,DCI))
19612 return R;
19613
19614 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19615 return SDValue();
19616
19617 if (VT.isScalableVector())
19618 return performSVEAndCombine(N, DCI);
19619
19620 // The combining code below works only for NEON vectors. In particular, it
19621 // does not work for SVE when dealing with vectors wider than 128 bits.
19622 if (!VT.is64BitVector() && !VT.is128BitVector())
19623 return SDValue();
19624
19625 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
19626 if (!BVN)
19627 return SDValue();
19628
19629 // AND does not accept an immediate, so check if we can use a BIC immediate
19630 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19631 // pattern in isel, because some immediates may be lowered to the preferred
19632 // (and x, (movi imm)) form, even though an mvni representation also exists.
19633 APInt DefBits(VT.getSizeInBits(), 0);
19634 APInt UndefBits(VT.getSizeInBits(), 0);
19635 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
19636 SDValue NewOp;
19637
19638 // Any bits known to already be 0 need not be cleared again, which can help
19639 // reduce the size of the immediate to one supported by the instruction.
19640 KnownBits Known = DAG.computeKnownBits(LHS);
19641 APInt ZeroSplat(VT.getSizeInBits(), 0);
19642 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19643 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
19644 << (Known.Zero.getBitWidth() * I);
19645
19646 DefBits = ~(DefBits | ZeroSplat);
19647 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19648 DefBits, &LHS)) ||
19649 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19650 DefBits, &LHS)))
19651 return NewOp;
19652
19653 UndefBits = ~(UndefBits | ZeroSplat);
19654 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19655 UndefBits, &LHS)) ||
19656 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19657 UndefBits, &LHS)))
19658 return NewOp;
19659 }
19660
19661 return SDValue();
19662}
19663
19664static SDValue performFADDCombine(SDNode *N,
19665 TargetLowering::DAGCombinerInfo &DCI) {
19666 SelectionDAG &DAG = DCI.DAG;
19667 SDValue LHS = N->getOperand(0);
19668 SDValue RHS = N->getOperand(1);
19669 EVT VT = N->getValueType(0);
19670 SDLoc DL(N);
19671
19672 if (!N->getFlags().hasAllowReassociation())
19673 return SDValue();
19674
19675 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19676 auto ReassocComplex = [&](SDValue A, SDValue B) {
19677 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19678 return SDValue();
19679 unsigned Opc = A.getConstantOperandVal(0);
19680 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19681 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19682 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19683 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19684 return SDValue();
19685 SDValue VCMLA = DAG.getNode(
19686 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
19687 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
19688 A.getOperand(2), A.getOperand(3));
19689 VCMLA->setFlags(A->getFlags());
19690 return VCMLA;
19691 };
19692 if (SDValue R = ReassocComplex(LHS, RHS))
19693 return R;
19694 if (SDValue R = ReassocComplex(RHS, LHS))
19695 return R;
19696
19697 return SDValue();
19698}
19699
19700static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19701 switch (Opcode) {
19702 case ISD::STRICT_FADD:
19703 case ISD::FADD:
19704 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19705 case ISD::ADD:
19706 return VT == MVT::i64;
19707 default:
19708 return false;
19709 }
19710}
19711
19712static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19713 AArch64CC::CondCode Cond);
19714
19715static bool isPredicateCCSettingOp(SDValue N) {
19716 if ((N.getOpcode() == ISD::SETCC) ||
19717 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19718 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
19719 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
19720 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
19721 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
19722 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
19723 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
19724 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
19725 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
19726 // get_active_lane_mask is lowered to a whilelo instruction.
19727 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
19728 return true;
19729
19730 return false;
19731}
19732
19733// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19734// ... into: "ptrue p, all" + PTEST
19735static SDValue
19736performFirstTrueTestVectorCombine(SDNode *N,
19737 TargetLowering::DAGCombinerInfo &DCI,
19738 const AArch64Subtarget *Subtarget) {
19739 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19740 // Make sure PTEST can be legalised with illegal types.
19741 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19742 return SDValue();
19743
19744 SDValue N0 = N->getOperand(0);
19745 EVT VT = N0.getValueType();
19746
19747 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19748 !isNullConstant(N->getOperand(1)))
19749 return SDValue();
19750
19751 // Restrict the DAG combine to only cases where we're extracting from a
19752 // flag-setting operation.
19753 if (!isPredicateCCSettingOp(N0))
19754 return SDValue();
19755
19756 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19757 SelectionDAG &DAG = DCI.DAG;
19758 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
19759 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
19760}
19761
19762// Materialize : Idx = (add (mul vscale, NumEls), -1)
19763// i1 = extract_vector_elt t37, Constant:i64<Idx>
19764// ... into: "ptrue p, all" + PTEST
19765static SDValue
19766performLastTrueTestVectorCombine(SDNode *N,
19767 TargetLowering::DAGCombinerInfo &DCI,
19768 const AArch64Subtarget *Subtarget) {
19769 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19770 // Make sure PTEST can be legalised with illegal types.
19771 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19772 return SDValue();
19773
19774 SDValue N0 = N->getOperand(0);
19775 EVT OpVT = N0.getValueType();
19776
19777 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19778 return SDValue();
19779
19780 // Idx == (add (mul vscale, NumEls), -1)
19781 SDValue Idx = N->getOperand(1);
19782 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
19783 return SDValue();
19784
19785 SDValue VS = Idx.getOperand(0);
19786 if (VS.getOpcode() != ISD::VSCALE)
19787 return SDValue();
19788
19789 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19790 if (VS.getConstantOperandVal(0) != NumEls)
19791 return SDValue();
19792
19793 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19794 SelectionDAG &DAG = DCI.DAG;
19795 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
19796 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
19797}
19798
19799static SDValue
19800performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19801 const AArch64Subtarget *Subtarget) {
19802 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19803 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19804 return Res;
19805 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19806 return Res;
19807
19808 SelectionDAG &DAG = DCI.DAG;
19809 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19810
19811 EVT VT = N->getValueType(0);
19812 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
19813 bool IsStrict = N0->isStrictFPOpcode();
19814
19815 // extract(dup x) -> x
19816 if (N0.getOpcode() == AArch64ISD::DUP)
19817 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
19818 : N0.getOperand(0);
19819
19820 // Rewrite for pairwise fadd pattern
19821 // (f32 (extract_vector_elt
19822 // (fadd (vXf32 Other)
19823 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19824 // ->
19825 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19826 // (extract_vector_elt (vXf32 Other) 1))
19827 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19828 // we can only do this when it's used only by the extract_vector_elt.
19829 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
19830 (!IsStrict || N0.hasOneUse())) {
19831 SDLoc DL(N0);
19832 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
19833 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
19834
19835 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
19836 SDValue Other = N00;
19837
19838 // And handle the commutative case.
19839 if (!Shuffle) {
19840 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
19841 Other = N01;
19842 }
19843
19844 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
19845 Other == Shuffle->getOperand(0)) {
19846 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19847 DAG.getConstant(0, DL, MVT::i64));
19848 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19849 DAG.getConstant(1, DL, MVT::i64));
19850 if (!IsStrict)
19851 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
19852
19853 // For strict_fadd we need uses of the final extract_vector to be replaced
19854 // with the strict_fadd, but we also need uses of the chain output of the
19855 // original strict_fadd to use the chain output of the new strict_fadd as
19856 // otherwise it may not be deleted.
19857 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
19858 {VT, MVT::Other},
19859 {N0->getOperand(0), Extract1, Extract2});
19860 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
19861 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
19862 return SDValue(N, 0);
19863 }
19864 }
19865
19866 return SDValue();
19867}
19868
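// Combine concat_vectors nodes. This covers several patterns handled below:
// concatenations of truncates (via shuffles or uzp2+shift), of small-element
// loads, of truncated nots, of identical binops, of rounding shifts (rshrn),
// of zip1/zip2 pairs, splat-like concats of a v1 vector, and a final
// canonicalisation of bitcasts on the right-hand operand.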
19869 static SDValue performConcatVectorsCombine(SDNode *N,
19870 TargetLowering::DAGCombinerInfo &DCI,
19871 SelectionDAG &DAG) {
19872 SDLoc dl(N);
19873 EVT VT = N->getValueType(0);
19874 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19875 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19876
19877 if (VT.isScalableVector())
19878 return SDValue();
19879
19880 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19881 N1Opc == ISD::TRUNCATE) {
19882 SDValue N00 = N0->getOperand(0);
19883 SDValue N10 = N1->getOperand(0);
19884 EVT N00VT = N00.getValueType();
19885 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
19886
19887 // Optimize concat_vectors of truncated vectors, where the intermediate
19888 // type is illegal, to avoid said illegality, e.g.,
19889 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19890 // (v2i16 (truncate (v2i64)))))
19891 // ->
19892 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19893 // (v4i32 (bitcast (v2i64))),
19894 // <0, 2, 4, 6>)))
19895 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19896 // on both input and result type, so we might generate worse code.
19897 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19898 if (N00VT == N10.getValueType() &&
19899 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19900 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19901 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19902 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
19903 for (size_t i = 0; i < Mask.size(); ++i)
19904 Mask[i] = i * 2;
19905 return DAG.getNode(ISD::TRUNCATE, dl, VT,
19906 DAG.getVectorShuffle(
19907 MidVT, dl,
19908 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
19909 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
19910 }
19911
19912 // Optimize two large shifts and a combine into a single combine and shift
19913 // For AArch64 architectures, sequences like the following:
19914 //
19915 // ushr v0.4s, v0.4s, #20
19916 // ushr v1.4s, v1.4s, #20
19917 // uzp1 v0.8h, v0.8h, v1.8h
19918 //
19919 // Can be optimized to:
19920 //
19921 // uzp2 v0.8h, v0.8h, v1.8h
19922 // ushr v0.8h, v0.8h, #4
19923 //
19924 // This optimization reduces instruction count.
19925 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
19926 N00->getOperand(1) == N10->getOperand(1)) {
19927 SDValue N000 = N00->getOperand(0);
19928 SDValue N100 = N10->getOperand(0);
19929 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
19930 N101ConstVal = N10->getConstantOperandVal(1),
19931 NScalarSize = N->getValueType(0).getScalarSizeInBits();
19932
19933 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
19934 N000 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N000);
19935 N100 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N100);
19936 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
19937 SDValue NewShiftConstant =
19938 DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32);
19939
19940 return DAG.getNode(AArch64ISD::VLSHR, dl, VT, Uzp, NewShiftConstant);
19941 }
19942 }
19943 }
19944
19945 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
19946 N->getOperand(0).getValueType() == MVT::v2i16 ||
19947 N->getOperand(0).getValueType() == MVT::v2i8) {
19948 EVT SrcVT = N->getOperand(0).getValueType();
19949 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
19950 // loads to prevent having to go through the v4i8 load legalization that
19951 // needs to extend each element into a larger type.
19952 if (N->getNumOperands() % 2 == 0 &&
19953 all_of(N->op_values(), [SrcVT](SDValue V) {
19954 if (V.getValueType() != SrcVT)
19955 return false;
19956 if (V.isUndef())
19957 return true;
19958 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
19959 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19960 LD->getExtensionType() == ISD::NON_EXTLOAD;
19961 })) {
19962 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
19963 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
19964 SmallVector<SDValue> Ops;
19965 
19966 for (unsigned i = 0; i < N->getNumOperands(); i++) {
19967 SDValue V = N->getOperand(i);
19968 if (V.isUndef())
19969 Ops.push_back(DAG.getUNDEF(FVT));
19970 else {
19971 LoadSDNode *LD = cast<LoadSDNode>(V);
19972 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
19973 LD->getBasePtr(), LD->getMemOperand());
19974 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
19975 Ops.push_back(NewLoad);
19976 }
19977 }
19978 return DAG.getBitcast(N->getValueType(0),
19979 DAG.getBuildVector(NVT, dl, Ops));
19980 }
19981 }
19982
19983 // Canonicalise concat_vectors to replace concatenations of truncated nots
19984 // with nots of concatenated truncates. This in some cases allows for multiple
19985 // redundant negations to be eliminated.
19986 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
19987 // (v4i16 (truncate (not (v4i32)))))
19988 // ->
19989 // (not (concat_vectors (v4i16 (truncate (v4i32))),
19990 // (v4i16 (truncate (v4i32)))))
19991 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19992 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
19993 N->isOnlyUserOf(N1.getNode())) {
19994 auto isBitwiseVectorNegate = [](SDValue V) {
19995 return V->getOpcode() == ISD::XOR &&
19996 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
19997 };
19998 SDValue N00 = N0->getOperand(0);
19999 SDValue N10 = N1->getOperand(0);
20000 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20001 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20002 return DAG.getNOT(
20003 dl,
20004 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20005 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
20006 N00->getOperand(0)),
20007 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
20008 N10->getOperand(0))),
20009 VT);
20010 }
20011 }
20012
20013 // Wait till after everything is legalized to try this. That way we have
20014 // legal vector types and such.
20015 if (DCI.isBeforeLegalizeOps())
20016 return SDValue();
20017
20018 // Optimise concat_vectors of two identical binops with a 128-bit destination
20019 // size, combining into a binop of two concats of the source vectors, e.g.:
20020 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20021 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20022 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
20023 N1->hasOneUse()) {
20024 SDValue N00 = N0->getOperand(0);
20025 SDValue N01 = N0->getOperand(1);
20026 SDValue N10 = N1->getOperand(0);
20027 SDValue N11 = N1->getOperand(1);
20028
20029 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20030 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
20031 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
20032 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
20033 }
20034 }
20035
20036 auto IsRSHRN = [](SDValue Shr) {
20037 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20038 return false;
20039 SDValue Op = Shr.getOperand(0);
20040 EVT VT = Op.getValueType();
20041 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20042 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20043 return false;
20044
20045 APInt Imm;
20046 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20047 Imm = APInt(VT.getScalarSizeInBits(),
20048 Op.getOperand(1).getConstantOperandVal(0)
20049 << Op.getOperand(1).getConstantOperandVal(1));
20050 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20051 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20052 Imm = APInt(VT.getScalarSizeInBits(),
20053 Op.getOperand(1).getConstantOperandVal(0));
20054 else
20055 return false;
20056
20057 if (Imm != 1ULL << (ShtAmt - 1))
20058 return false;
20059 return true;
20060 };
20061
20062 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20063 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20064 ((IsRSHRN(N1) &&
20065 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20066 N1.isUndef())) {
20067 SDValue X = N0.getOperand(0).getOperand(0);
20068 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20069 : N1.getOperand(0).getOperand(0);
20070 EVT BVT =
20071 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20072 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
20073 SDValue Add = DAG.getNode(
20074 ISD::ADD, dl, BVT, CC,
20075 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
20076 SDValue Shr =
20077 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
20078 return Shr;
20079 }
20080
20081 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20082 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20083 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20084 N0.getOperand(1) == N1.getOperand(1)) {
20085 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
20086 DAG.getUNDEF(N0.getValueType()));
20087 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
20088 DAG.getUNDEF(N0.getValueType()));
20089 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
20090 }
20091
20092 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20093 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20094 // canonicalise to that.
20095 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20096 assert(VT.getScalarSizeInBits() == 64);
20097 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
20098 DAG.getConstant(0, dl, MVT::i64));
20099 }
20100
20101 // Canonicalise concat_vectors so that the right-hand vector has as few
20102 // bit-casts as possible before its real operation. The primary matching
20103 // destination for these operations will be the narrowing "2" instructions,
20104 // which depend on the operation being performed on this right-hand vector.
20105 // For example,
20106 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20107 // becomes
20108 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20109
20110 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20111 return SDValue();
20112 SDValue RHS = N1->getOperand(0);
20113 MVT RHSTy = RHS.getValueType().getSimpleVT();
20114 // If the RHS is not a vector, this is not the pattern we're looking for.
20115 if (!RHSTy.isVector())
20116 return SDValue();
20117
20118 LLVM_DEBUG(
20119 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20120
20121 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20122 RHSTy.getVectorNumElements() * 2);
20123 return DAG.getNode(ISD::BITCAST, dl, VT,
20124 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
20125 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
20126 RHS));
20127}
20128
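// Fold an extract_subvector of a constant splat into a narrower splat for
// scalable i1 vectors (see the NOTE in the body for why this is done here
// rather than in the generic DAGCombiner).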
20129static SDValue
20130 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20131 SelectionDAG &DAG) {
20132 if (DCI.isBeforeLegalizeOps())
20133 return SDValue();
20134
20135 EVT VT = N->getValueType(0);
20136 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20137 return SDValue();
20138
20139 SDValue V = N->getOperand(0);
20140
20141 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20142 // blocks this combine because the non-const case requires custom lowering.
20143 //
20144 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20145 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20146 if (isa<ConstantSDNode>(V.getOperand(0)))
20147 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20148
20149 return SDValue();
20150}
20151
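// Fold an "aligned" insert of a half-width fixed-length subvector into a
// concat_vectors of the subvector and the remaining half of the destination.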
20152static SDValue
20153 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20154 SelectionDAG &DAG) {
20155 SDLoc DL(N);
20156 SDValue Vec = N->getOperand(0);
20157 SDValue SubVec = N->getOperand(1);
20158 uint64_t IdxVal = N->getConstantOperandVal(2);
20159 EVT VecVT = Vec.getValueType();
20160 EVT SubVT = SubVec.getValueType();
20161
20162 // Only do this for legal fixed vector types.
20163 if (!VecVT.isFixedLengthVector() ||
20164 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20165 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20166 return SDValue();
20167
20168 // Ignore widening patterns.
20169 if (IdxVal == 0 && Vec.isUndef())
20170 return SDValue();
20171
20172 // Subvector must be half the width and an "aligned" insertion.
20173 unsigned NumSubElts = SubVT.getVectorNumElements();
20174 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20175 (IdxVal != 0 && IdxVal != NumSubElts))
20176 return SDValue();
20177
20178 // Fold insert_subvector -> concat_vectors
20179 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20180 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20181 SDValue Lo, Hi;
20182 if (IdxVal == 0) {
20183 Lo = SubVec;
20184 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20185 DAG.getVectorIdxConstant(NumSubElts, DL));
20186 } else {
20187 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20188 DAG.getVectorIdxConstant(0, DL));
20189 Hi = SubVec;
20190 }
20191 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20192}
20193
20194 static SDValue tryCombineFixedPointConvert(SDNode *N,
20195 TargetLowering::DAGCombinerInfo &DCI,
20196 SelectionDAG &DAG) {
20197 // Wait until after everything is legalized to try this. That way we have
20198 // legal vector types and such.
20199 if (DCI.isBeforeLegalizeOps())
20200 return SDValue();
20201 // Transform a scalar conversion of a value from a lane extract into a
20202 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20203 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20204 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20205 //
20206 // The second form interacts better with instruction selection and the
20207 // register allocator to avoid cross-class register copies that aren't
20208 // coalescable due to a lane reference.
20209
20210 // Check the operand and see if it originates from a lane extract.
20211 SDValue Op1 = N->getOperand(1);
20212 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20213 return SDValue();
20214
20215 // Yep, no additional predication needed. Perform the transform.
20216 SDValue IID = N->getOperand(0);
20217 SDValue Shift = N->getOperand(2);
20218 SDValue Vec = Op1.getOperand(0);
20219 SDValue Lane = Op1.getOperand(1);
20220 EVT ResTy = N->getValueType(0);
20221 EVT VecResTy;
20222 SDLoc DL(N);
20223
20224 // The vector width should be 128 bits by the time we get here, even
20225 // if it started as 64 bits (the extract_vector handling will have
20226 // done so). Bail if it is not.
20227 if (Vec.getValueSizeInBits() != 128)
20228 return SDValue();
20229
20230 if (Vec.getValueType() == MVT::v4i32)
20231 VecResTy = MVT::v4f32;
20232 else if (Vec.getValueType() == MVT::v2i64)
20233 VecResTy = MVT::v2f64;
20234 else
20235 return SDValue();
20236
20237 SDValue Convert =
20238 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20239 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20240}
20241
20242// AArch64 high-vector "long" operations are formed by performing the non-high
20243// version on an extract_subvector of each operand which gets the high half:
20244//
20245// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20246//
20247// However, there are cases which don't have an extract_high explicitly, but
20248// have another operation that can be made compatible with one for free. For
20249// example:
20250//
20251// (dupv64 scalar) --> (extract_high (dup128 scalar))
20252//
20253// This routine does the actual conversion of such DUPs, once outer routines
20254// have determined that everything else is in order.
20255// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20256// similarly here.
20257 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20258 MVT VT = N.getSimpleValueType();
20259 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20260 N.getConstantOperandVal(1) == 0)
20261 N = N.getOperand(0);
20262
20263 switch (N.getOpcode()) {
20264 case AArch64ISD::DUP:
20265 case AArch64ISD::DUPLANE8:
20266 case AArch64ISD::DUPLANE16:
20267 case AArch64ISD::DUPLANE32:
20268 case AArch64ISD::DUPLANE64:
20269 case AArch64ISD::MOVI:
20270 case AArch64ISD::MOVIshift:
20271 case AArch64ISD::MOVIedit:
20272 case AArch64ISD::MOVImsl:
20273 case AArch64ISD::MVNIshift:
20274 case AArch64ISD::MVNImsl:
20275 break;
20276 default:
20277 // FMOV could be supported, but isn't very useful, as it would only occur
20278 // if you passed a bitcast'd floating point immediate to an eligible long
20279 // integer op (addl, smull, ...).
20280 return SDValue();
20281 }
20282
20283 if (!VT.is64BitVector())
20284 return SDValue();
20285
20286 SDLoc DL(N);
20287 unsigned NumElems = VT.getVectorNumElements();
20288 if (N.getValueType().is64BitVector()) {
20289 MVT ElementTy = VT.getVectorElementType();
20290 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20291 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20292 }
20293
20294 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20295 DAG.getConstant(NumElems, DL, MVT::i64));
20296}
20297
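// Returns true if N (looking through a bitcast) is an extract_subvector of
// the high half of a fixed-length vector.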
20298 static bool isEssentiallyExtractHighSubvector(SDValue N) {
20299 if (N.getOpcode() == ISD::BITCAST)
20300 N = N.getOperand(0);
20301 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20302 return false;
20303 if (N.getOperand(0).getValueType().isScalableVector())
20304 return false;
20305 return N.getConstantOperandAPInt(1) ==
20306 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20307}
20308
20309/// Helper structure to keep track of ISD::SET_CC operands.
20314};
20315
20316/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20317 struct AArch64SetCCInfo {
20318 const SDValue *Cmp;
20319 AArch64CC::CondCode CC;
20320};
20321
20322/// Helper structure to keep track of SetCC information.
20326};
20327
20328/// Helper structure to be able to read SetCC information. If set to
20329/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
20330/// GenericSetCCInfo.
20334};
20335
20336/// Check whether or not \p Op is a SET_CC operation, either a generic or
20337/// an
20338/// AArch64 lowered one.
20339/// \p SetCCInfo is filled accordingly.
20340 /// \post SetCCInfo is meaningful only when this function returns true.
20341/// \return True when Op is a kind of SET_CC operation.
20342 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
20343 // If this is a setcc, this is straightforward.
20344 if (Op.getOpcode() == ISD::SETCC) {
20345 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20346 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20347 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20348 SetCCInfo.IsAArch64 = false;
20349 return true;
20350 }
20351 // Otherwise, check if this is a matching csel instruction.
20352 // In other words:
20353 // - csel 1, 0, cc
20354 // - csel 0, 1, !cc
20355 if (Op.getOpcode() != AArch64ISD::CSEL)
20356 return false;
20357 // Set the information about the operands.
20358 // TODO: we want the operands of the Cmp not the csel
20359 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20360 SetCCInfo.IsAArch64 = true;
20361 SetCCInfo.Info.AArch64.CC =
20362 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20363
20364 // Check that the operands match the constraints:
20365 // (1) Both operands must be constants.
20366 // (2) One must be 1 and the other must be 0.
20367 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20368 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20369
20370 // Check (1).
20371 if (!TValue || !FValue)
20372 return false;
20373
20374 // Check (2).
20375 if (!TValue->isOne()) {
20376 // Update the comparison when we are interested in !cc.
20377 std::swap(TValue, FValue);
20378 SetCCInfo.Info.AArch64.CC =
20379 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
20380 }
20381 return TValue->isOne() && FValue->isZero();
20382}
20383
20384// Returns true if Op is setcc or zext of setcc.
20385static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20386 if (isSetCC(Op, Info))
20387 return true;
20388 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
20389 isSetCC(Op->getOperand(0), Info));
20390}
20391
20392// The folding we want to perform is:
20393// (add x, [zext] (setcc cc ...) )
20394// -->
20395// (csel x, (add x, 1), !cc ...)
20396//
20397// The latter will get matched to a CSINC instruction.
20398 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
20399 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
20400 SDValue LHS = Op->getOperand(0);
20401 SDValue RHS = Op->getOperand(1);
20402 SetCCInfoAndKind InfoAndKind;
20403
20404 // If both operands are a SET_CC, then we don't want to perform this
20405 // folding and create another csel as this results in more instructions
20406 // (and higher register usage).
20407 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
20408 isSetCCOrZExtSetCC(RHS, InfoAndKind))
20409 return SDValue();
20410
20411 // If neither operand is a SET_CC, give up.
20412 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
20413 std::swap(LHS, RHS);
20414 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
20415 return SDValue();
20416 }
20417
20418 // FIXME: This could be generalized to work for FP comparisons.
20419 EVT CmpVT = InfoAndKind.IsAArch64
20420 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
20421 : InfoAndKind.Info.Generic.Opnd0->getValueType();
20422 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
20423 return SDValue();
20424
20425 SDValue CCVal;
20426 SDValue Cmp;
20427 SDLoc dl(Op);
20428 if (InfoAndKind.IsAArch64) {
20429 CCVal = DAG.getConstant(
20430 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
20431 MVT::i32);
20432 Cmp = *InfoAndKind.Info.AArch64.Cmp;
20433 } else
20434 Cmp = getAArch64Cmp(
20435 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
20436 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
20437 dl);
20438
20439 EVT VT = Op->getValueType(0);
20440 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
20441 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
20442}
20443
20444// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
20445 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
20446 EVT VT = N->getValueType(0);
20447 // Only scalar integer and vector types.
20448 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
20449 return SDValue();
20450
20451 SDValue LHS = N->getOperand(0);
20452 SDValue RHS = N->getOperand(1);
20453 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20454 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
20455 return SDValue();
20456
20457 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
20458 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
20459 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
20460 return SDValue();
20461
20462 SDValue Op1 = LHS->getOperand(0);
20463 SDValue Op2 = RHS->getOperand(0);
20464 EVT OpVT1 = Op1.getValueType();
20465 EVT OpVT2 = Op2.getValueType();
20466 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
20467 Op2.getOpcode() != AArch64ISD::UADDV ||
20468 OpVT1.getVectorElementType() != VT)
20469 return SDValue();
20470
20471 SDValue Val1 = Op1.getOperand(0);
20472 SDValue Val2 = Op2.getOperand(0);
20473 EVT ValVT = Val1->getValueType(0);
20474 SDLoc DL(N);
20475 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
20476 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
20477 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
20478 DAG.getConstant(0, DL, MVT::i64));
20479}
20480
20481/// Perform the scalar expression combine in the form of:
20482/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
20483/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
20484 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
20485 EVT VT = N->getValueType(0);
20486 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
20487 return SDValue();
20488
20489 SDValue LHS = N->getOperand(0);
20490 SDValue RHS = N->getOperand(1);
20491
20492 // Handle commutativity.
20493 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20494 LHS.getOpcode() != AArch64ISD::CSNEG) {
20495 std::swap(LHS, RHS);
20496 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20497 LHS.getOpcode() != AArch64ISD::CSNEG) {
20498 return SDValue();
20499 }
20500 }
20501
20502 if (!LHS.hasOneUse())
20503 return SDValue();
20504
20505 AArch64CC::CondCode AArch64CC =
20506 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
20507
20508 // The CSEL should have a constant one operand, and the CSNEG should have
20509 // a one or negative-one operand.
20510 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
20511 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
20512 if (!CTVal || !CFVal)
20513 return SDValue();
20514
20515 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
20516 (CTVal->isOne() || CFVal->isOne())) &&
20517 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
20518 (CTVal->isOne() || CFVal->isAllOnes())))
20519 return SDValue();
20520
20521 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
20522 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
20523 !CFVal->isOne()) {
20524 std::swap(CTVal, CFVal);
20525 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20526 }
20527
20528 SDLoc DL(N);
20529 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
20530 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
20531 !CFVal->isAllOnes()) {
20532 APInt C = -1 * CFVal->getAPIntValue();
20533 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
20534 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
20535 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20536 }
20537
20538 // It might be neutral for larger constants, as the immediate needs to be
20539 // materialized in a register.
20540 APInt ADDC = CTVal->getAPIntValue();
20541 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20542 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
20543 return SDValue();
20544
20545 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
20546 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
20547 "Unexpected constant value");
20548
20549 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
20550 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
20551 SDValue Cmp = LHS.getOperand(3);
20552
20553 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
20554}
20555
20556// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
20557 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
20558 EVT VT = N->getValueType(0);
20559 if (N->getOpcode() != ISD::ADD)
20560 return SDValue();
20561
20562 SDValue Dot = N->getOperand(0);
20563 SDValue A = N->getOperand(1);
20564 // Handle commutativity.
20565 auto isZeroDot = [](SDValue Dot) {
20566 return (Dot.getOpcode() == AArch64ISD::UDOT ||
20567 Dot.getOpcode() == AArch64ISD::SDOT) &&
20568 isZerosVector(Dot.getOperand(0).getNode());
20569 };
20570 if (!isZeroDot(Dot))
20571 std::swap(Dot, A);
20572 if (!isZeroDot(Dot))
20573 return SDValue();
20574
20575 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
20576 Dot.getOperand(2));
20577}
20578
20579 static bool isNegatedInteger(SDValue Op) {
20580 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
20581}
20582
20583 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
20584 SDLoc DL(Op);
20585 EVT VT = Op.getValueType();
20586 SDValue Zero = DAG.getConstant(0, DL, VT);
20587 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
20588}
20589
20590// Try to fold
20591//
20592// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
20593//
20594// The folding helps csel to be matched with csneg without generating
20595// redundant neg instruction, which includes negation of the csel expansion
20596// of abs node lowered by lowerABS.
20597 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
20598 if (!isNegatedInteger(SDValue(N, 0)))
20599 return SDValue();
20600
20601 SDValue CSel = N->getOperand(1);
20602 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
20603 return SDValue();
20604
20605 SDValue N0 = CSel.getOperand(0);
20606 SDValue N1 = CSel.getOperand(1);
20607
20608 // If neither of them is a negation, the fold is not worthwhile, as it would
20609 // introduce two additional negations while removing only one.
20610 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
20611 return SDValue();
20612
20613 SDValue N0N = getNegatedInteger(N0, DAG);
20614 SDValue N1N = getNegatedInteger(N1, DAG);
20615
20616 SDLoc DL(N);
20617 EVT VT = CSel.getValueType();
20618 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
20619 CSel.getOperand(3));
20620}
20621
20622// The basic add/sub long vector instructions have variants with "2" on the end
20623// which act on the high-half of their inputs. They are normally matched by
20624// patterns like:
20625//
20626// (add (zeroext (extract_high LHS)),
20627// (zeroext (extract_high RHS)))
20628// -> uaddl2 vD, vN, vM
20629//
20630// However, if one of the extracts is something like a duplicate, this
20631// instruction can still be used profitably. This function puts the DAG into a
20632// more appropriate form for those patterns to trigger.
20633 static SDValue performAddSubLongCombine(SDNode *N,
20634 TargetLowering::DAGCombinerInfo &DCI) {
20635 SelectionDAG &DAG = DCI.DAG;
20636 if (DCI.isBeforeLegalizeOps())
20637 return SDValue();
20638
20639 MVT VT = N->getSimpleValueType(0);
20640 if (!VT.is128BitVector()) {
20641 if (N->getOpcode() == ISD::ADD)
20642 return performSetccAddFolding(N, DAG);
20643 return SDValue();
20644 }
20645
20646 // Make sure both branches are extended in the same way.
20647 SDValue LHS = N->getOperand(0);
20648 SDValue RHS = N->getOperand(1);
20649 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20650 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20651 LHS.getOpcode() != RHS.getOpcode())
20652 return SDValue();
20653
20654 unsigned ExtType = LHS.getOpcode();
20655
20656 // It's not worth doing unless at least one of the inputs is already an
20657 // extract, but we don't know which it'll be so we have to try both.
20658 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
20659 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
20660 if (!RHS.getNode())
20661 return SDValue();
20662
20663 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
20664 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
20665 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
20666 if (!LHS.getNode())
20667 return SDValue();
20668
20669 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
20670 }
20671
20672 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
20673}
20674
20675static bool isCMP(SDValue Op) {
20676 return Op.getOpcode() == AArch64ISD::SUBS &&
20677 !Op.getNode()->hasAnyUseOfValue(0);
20678}
20679
20680// (CSEL 1 0 CC Cond) => CC
20681// (CSEL 0 1 CC Cond) => !CC
20682static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
20683 if (Op.getOpcode() != AArch64ISD::CSEL)
20684 return std::nullopt;
20685 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20686 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
20687 return std::nullopt;
20688 SDValue OpLHS = Op.getOperand(0);
20689 SDValue OpRHS = Op.getOperand(1);
20690 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
20691 return CC;
20692 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
20693 return getInvertedCondCode(CC);
20694
20695 return std::nullopt;
20696}
20697
20698// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
20699// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
20700static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
20701 SDValue CmpOp = Op->getOperand(2);
20702 if (!isCMP(CmpOp))
20703 return SDValue();
20704
20705 if (IsAdd) {
20706 if (!isOneConstant(CmpOp.getOperand(1)))
20707 return SDValue();
20708 } else {
20709 if (!isNullConstant(CmpOp.getOperand(0)))
20710 return SDValue();
20711 }
20712
20713 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
20714 auto CC = getCSETCondCode(CsetOp);
20715 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
20716 return SDValue();
20717
20718 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
20719 Op->getOperand(0), Op->getOperand(1),
20720 CsetOp.getOperand(3));
20721}
20722
20723// (ADC x 0 cond) => (CINC x HS cond)
20724 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
20725 SDValue LHS = N->getOperand(0);
20726 SDValue RHS = N->getOperand(1);
20727 SDValue Cond = N->getOperand(2);
20728
20729 if (!isNullConstant(RHS))
20730 return SDValue();
20731
20732 EVT VT = N->getValueType(0);
20733 SDLoc DL(N);
20734
20735 // (CINC x cc cond) <=> (CSINC x x !cc cond)
20736 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
20737 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
20738}
20739
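// Combine build_vector nodes: fp_rounds of f64 lane extracts into FCVTXN,
// fp_extends of adjacent lane extracts into a vector fp_extend plus
// extract_subvector, and (for v2i32) two contiguous lane extracts into an
// extract_subvector of an any-extended vector.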
20740 static SDValue performBuildVectorCombine(SDNode *N,
20741 TargetLowering::DAGCombinerInfo &DCI,
20742 SelectionDAG &DAG) {
20743 SDLoc DL(N);
20744 EVT VT = N->getValueType(0);
20745
20747 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
20748 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
20749 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
20750 if (Elt0->getOpcode() == ISD::FP_ROUND &&
20751 Elt1->getOpcode() == ISD::FP_ROUND &&
20752 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20753 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20754 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
20755 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20756 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20757 // Constant index.
20758 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20759 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20760 Elt0->getOperand(0)->getOperand(0) ==
20761 Elt1->getOperand(0)->getOperand(0) &&
20762 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
20763 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
20764 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
20765 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
20766 SDValue HighLanes;
20767 if (Elt2->getOpcode() == ISD::UNDEF &&
20768 Elt3->getOpcode() == ISD::UNDEF) {
20769 HighLanes = DAG.getUNDEF(MVT::v2f32);
20770 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
20771 Elt3->getOpcode() == ISD::FP_ROUND &&
20772 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
20773 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
20774 Elt2->getConstantOperandVal(1) ==
20775 Elt3->getConstantOperandVal(1) &&
20776 Elt2->getOperand(0)->getOpcode() ==
20777 ISD::EXTRACT_VECTOR_ELT &&
20778 Elt3->getOperand(0)->getOpcode() ==
20779 ISD::EXTRACT_VECTOR_ELT &&
20780 // Constant index.
20781 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
20782 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
20783 Elt2->getOperand(0)->getOperand(0) ==
20784 Elt3->getOperand(0)->getOperand(0) &&
20785 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
20786 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
20787 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
20788 HighLanes =
20789 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
20790 }
20791 if (HighLanes) {
20792 SDValue DoubleToSingleSticky =
20793 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
20794 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
20795 DoubleToSingleSticky, HighLanes);
20796 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
20797 Elt0->getOperand(1));
20798 }
20799 }
20800 }
20801 }
20802
20803 if (VT == MVT::v2f64) {
20804 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20805 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
20806 Elt1->getOpcode() == ISD::FP_EXTEND &&
20807 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20808 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20809 Elt0->getOperand(0)->getOperand(0) ==
20810 Elt1->getOperand(0)->getOperand(0) &&
20811 // Constant index.
20812 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20813 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20814 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
20815 Elt1->getOperand(0)->getConstantOperandVal(1) &&
20816 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20817 // ResultType's known minimum vector length.
20818 Elt0->getOperand(0)->getConstantOperandVal(1) %
20819 VT.getVectorMinNumElements() ==
20820 0) {
20821 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
20822 if (SrcVec.getValueType() == MVT::v4f16 ||
20823 SrcVec.getValueType() == MVT::v4bf16) {
20824 SDValue HalfToSingle =
20825 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
20826 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
20827 SDValue Extract = DAG.getNode(
20828 ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f32,
20829 HalfToSingle, SubvectorIdx);
20830 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
20831 }
20832 }
20833 }
20834
20835 // A build vector of two extracted elements is equivalent to an
20836 // extract subvector where the inner vector is any-extended to the
20837 // extract_vector_elt VT.
20838 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
20839 // (extract_elt_iXX_to_i32 vec Idx+1))
20840 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
20841
20842 // For now, only consider the v2i32 case, which arises as a result of
20843 // legalization.
20844 if (VT != MVT::v2i32)
20845 return SDValue();
20846
20847 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20848 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
20849 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20850 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20851 // Constant index.
20852 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20853 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20854 // Both EXTRACT_VECTOR_ELT from same vector...
20855 Elt0->getOperand(0) == Elt1->getOperand(0) &&
20856 // ... and contiguous. First element's index +1 == second element's index.
20857 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
20858 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20859 // ResultType's known minimum vector length.
20860 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
20861 SDValue VecToExtend = Elt0->getOperand(0);
20862 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
20863 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
20864 return SDValue();
20865
20866 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
20867
20868 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
20869 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
20870 SubvectorIdx);
20871 }
20872
20873 return SDValue();
20874}
20875
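// Combine truncate nodes: fold trunc(dup x) into dup(trunc x) for 64-bit
// fixed-length results, and rewrite i32 (trunc (extract v2i64/nxv2i64)) into
// an extract from an NVCAST'd i32 vector, which is friendlier to ISel.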
20876 static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
20877 TargetLowering::DAGCombinerInfo &DCI) {
20878 SDLoc DL(N);
20879 EVT VT = N->getValueType(0);
20880 SDValue N0 = N->getOperand(0);
20881 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20882 N0.getOpcode() == AArch64ISD::DUP) {
20883 SDValue Op = N0.getOperand(0);
20884 if (VT.getScalarType() == MVT::i32 &&
20885 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
20886 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
20887 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
20888 }
20889
20890 // Performing the following combine produces a preferable form for ISEL.
20891 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
20892 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20893 N0.hasOneUse()) {
20894 SDValue Op = N0.getOperand(0);
20895 SDValue ExtractIndexNode = N0.getOperand(1);
20896 if (!isa<ConstantSDNode>(ExtractIndexNode))
20897 return SDValue();
20898
20899 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
20900 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
20901 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
20902 "Unexpected legalisation result!");
20903
20904 EVT SrcVectorType = Op.getValueType();
20905 // We also assume that SrcVectorType cannot be a V64 (see
20906 // LowerEXTRACT_VECTOR_ELT).
20907 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
20908 "Unexpected legalisation result!");
20909
20910 unsigned ExtractIndex =
20911 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
20912 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
20913
20914 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
20915 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
20916 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
20917 }
20918
20919 return SDValue();
20920}
20921
20922 // Check whether a node is an extend or shift operand.
20923 static bool isExtendOrShiftOperand(SDValue N) {
20924 unsigned Opcode = N.getOpcode();
20925 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20926 EVT SrcVT;
20927 if (Opcode == ISD::SIGN_EXTEND_INREG)
20928 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
20929 else
20930 SrcVT = N.getOperand(0).getValueType();
20931
20932 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20933 } else if (Opcode == ISD::AND) {
20934 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
20935 if (!CSD)
20936 return false;
20937 uint64_t AndMask = CSD->getZExtValue();
20938 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20939 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20940 return isa<ConstantSDNode>(N.getOperand(1));
20941 }
20942
20943 return false;
20944}
20945
20946// (N - Y) + Z --> (Z - Y) + N
20947// when N is an extend or shift operand
20948 static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20949 SelectionDAG &DAG) {
20950 auto IsOneUseExtend = [](SDValue N) {
20951 return N.hasOneUse() && isExtendOrShiftOperand(N);
20952 };
20953
20954 // DAGCombiner will revert this combination when Z is a constant, causing an
20955 // infinite loop. So don't enable the combination when Z is a constant.
20956 // If Z is a one-use shift by a constant, we also can't do the optimization,
20957 // as it would fall into the same infinite loop.
20958 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
20959 return SDValue();
20960
20961 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
20962 return SDValue();
20963
20964 SDValue Shift = SUB.getOperand(0);
20965 if (!IsOneUseExtend(Shift))
20966 return SDValue();
20967
20968 SDLoc DL(N);
20969 EVT VT = N->getValueType(0);
20970
20971 SDValue Y = SUB.getOperand(1);
20972 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
20973 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
20974}
20975
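// For an ADD of two operands shifted left by an immediate, swap the operands
// so that the one with the smaller shift amount (<= 4) ends up on the RHS;
// see the comment in the body for the micro-architectural rationale.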
20976 static SDValue performAddCombineForShiftedOperands(SDNode *N,
20977 SelectionDAG &DAG) {
20978 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
20979 // commutative.
20980 if (N->getOpcode() != ISD::ADD)
20981 return SDValue();
20982
20983 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
20984 // shifted register is only available for i32 and i64.
20985 EVT VT = N->getValueType(0);
20986 if (VT != MVT::i32 && VT != MVT::i64)
20987 return SDValue();
20988
20989 SDLoc DL(N);
20990 SDValue LHS = N->getOperand(0);
20991 SDValue RHS = N->getOperand(1);
20992
20993 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
20994 return Val;
20995 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
20996 return Val;
20997
20998 uint64_t LHSImm = 0, RHSImm = 0;
20999 // If both operands are shifted by an immediate and the shift amount is not
21000 // greater than 4 for one operand, swap LHS and RHS to put the operand with
21001 // the smaller shift amount on the RHS.
21002 //
21003 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21004 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21005 // with LSL (shift > 4). For the rest of the processors, this is a no-op for
21006 // performance or correctness.
21007 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
21008 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
21009 RHSImm > 4 && LHS.hasOneUse())
21010 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
21011
21012 return SDValue();
21013}
21014
21015// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
21016// This reassociates it back to allow the creation of more mls instructions.
21017 static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
21018 if (N->getOpcode() != ISD::SUB)
21019 return SDValue();
21020
21021 SDValue Add = N->getOperand(1);
21022 SDValue X = N->getOperand(0);
21023 if (Add.getOpcode() != ISD::ADD)
21024 return SDValue();
21025
21026 if (!Add.hasOneUse())
21027 return SDValue();
21028 if (DAG.isConstantIntBuildVectorOrConstantInt(X))
21029 return SDValue();
21030
21031 SDValue M1 = Add.getOperand(0);
21032 SDValue M2 = Add.getOperand(1);
21033 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21034 M1.getOpcode() != AArch64ISD::UMULL)
21035 return SDValue();
21036 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21037 M2.getOpcode() != AArch64ISD::UMULL)
21038 return SDValue();
21039
21040 EVT VT = N->getValueType(0);
21041 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21042 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21043}
21044
21045// Combine into mla/mls.
21046// This works on the patterns of:
21047// add v1, (mul v2, v3)
21048// sub v1, (mul v2, v3)
21049// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21050// It will transform the add/sub to a scalable version, so that we can
21051// make use of SVE's MLA/MLS that will be generated for that pattern
21052static SDValue
21054 SelectionDAG &DAG = DCI.DAG;
21055 // Make sure that the types are legal
21056 if (!DCI.isAfterLegalizeDAG())
21057 return SDValue();
21058 // Before using SVE's features, check first if it's available.
21059 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21060 return SDValue();
21061
21062 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21063 return SDValue();
21064
21065 if (!N->getValueType(0).isFixedLengthVector())
21066 return SDValue();
21067
21068 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21069 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21070 return SDValue();
21071
21072 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21073 return SDValue();
21074
21075 SDValue MulValue = Op1->getOperand(0);
21076 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21077 return SDValue();
21078
21079 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21080 return SDValue();
21081
21082 EVT ScalableVT = MulValue.getValueType();
21083 if (!ScalableVT.isScalableVector())
21084 return SDValue();
21085
21086 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21087 SDValue NewValue =
21088 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21089 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21090 };
21091
21092 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21093 return res;
21094 else if (N->getOpcode() == ISD::ADD)
21095 return performOpt(N->getOperand(1), N->getOperand(0));
21096
21097 return SDValue();
21098}
21099
21100// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21101// help, for example, to produce ssra from sshr+add.
21103 EVT VT = N->getValueType(0);
21104 if (VT != MVT::i64 ||
21105 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21106 return SDValue();
21107 SDValue Op0 = N->getOperand(0);
21108 SDValue Op1 = N->getOperand(1);
21109
21110 // At least one of the operands should be an extract, and the other should be
21111 // something that is easy to convert to v1i64 type (in this case a load).
21112 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21113 Op0.getOpcode() != ISD::LOAD)
21114 return SDValue();
21115 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21116 Op1.getOpcode() != ISD::LOAD)
21117 return SDValue();
21118
21119 SDLoc DL(N);
21120 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21121 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21122 Op0 = Op0.getOperand(0);
21123 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21124 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21125 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21126 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21127 Op1 = Op1.getOperand(0);
21128 } else
21129 return SDValue();
21130
21131 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21132 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21133 DAG.getConstant(0, DL, MVT::i64));
21134}
21135
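// Collect the simple loads feeding B into Loads, whether B is a single load,
// a build_vector/concat_vectors of loads, or the shuffle-of-concats tree
// produced when IR shuffles of loads are lowered. Returns false otherwise.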
21136 static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
21137 SDValue BV = peekThroughOneUseBitcasts(B);
21138 if (!BV->hasOneUse())
21139 return false;
21140 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21141 if (!Ld || !Ld->isSimple())
21142 return false;
21143 Loads.push_back(Ld);
21144 return true;
21145 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21147 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21148 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
21149 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
21150 return false;
21151 Loads.push_back(Ld);
21152 }
21153 return true;
21154 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21155 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21156 // are lowered. Note that this only comes up because we do not always visit
21157 // operands before uses. After that is fixed this can be removed and in the
21158 // meantime this is fairly specific to the lowering we expect from IR.
21159 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21160 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21161 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21162 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21163 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21164 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21165 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21166 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21167 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21168 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21169 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
21170 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21171 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21172 B.getOperand(1).getNumOperands() != 4)
21173 return false;
21174 auto SV1 = cast<ShuffleVectorSDNode>(B);
21175 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
21176 int NumElts = B.getValueType().getVectorNumElements();
21177 int NumSubElts = NumElts / 4;
21178 for (int I = 0; I < NumSubElts; I++) {
21179 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21180 if (SV1->getMaskElt(I) != I ||
21181 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21182 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
21183 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
21184 return false;
21185 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21186 if (SV2->getMaskElt(I) != I ||
21187 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21188 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
21189 return false;
21190 }
21191 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
21192 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
21193 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
21194 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
21195 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21196 !Ld2->isSimple() || !Ld3->isSimple())
21197 return false;
21198 Loads.push_back(Ld0);
21199 Loads.push_back(Ld1);
21200 Loads.push_back(Ld2);
21201 Loads.push_back(Ld3);
21202 return true;
21203 }
21204 return false;
21205}
21206
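// Returns true if Op0 and Op1 are identical expression trees (adds, subs and
// extends) whose leaf loads are the same size and consecutive in memory, so
// the pair can later be rebuilt on top of combined double-width loads.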
21207 static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
21208 SelectionDAG &DAG,
21209 unsigned &NumSubLoads) {
21210 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21211 return false;
21212
21213 SmallVector<LoadSDNode *> Loads0, Loads1;
21214 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21215 isLoadOrMultipleLoads(Op1, Loads1)) {
21216 if (NumSubLoads && Loads0.size() != NumSubLoads)
21217 return false;
21218 NumSubLoads = Loads0.size();
21219 return Loads0.size() == Loads1.size() &&
21220 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
21221 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21222 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21223 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
21224 Size / 8, 1);
21225 });
21226 }
21227
21228 if (Op0.getOpcode() != Op1.getOpcode())
21229 return false;
21230
21231 switch (Op0.getOpcode()) {
21232 case ISD::ADD:
21233 case ISD::SUB:
21234 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21235 DAG, NumSubLoads) &&
21236 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
21237 DAG, NumSubLoads);
21238 case ISD::SIGN_EXTEND:
21239 case ISD::ANY_EXTEND:
21240 case ISD::ZERO_EXTEND:
21241 EVT XVT = Op0.getOperand(0).getValueType();
21242 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21243 XVT.getScalarSizeInBits() != 32)
21244 return false;
21245 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21246 DAG, NumSubLoads);
21247 }
21248 return false;
21249}
21250
21251 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
21252 // into a single load of twice the size, from which we extract the bottom and
21253 // top parts so that the shl can use a shll2 instruction. The two loads in that
21254// example can also be larger trees of instructions, which are identical except
21255// for the leaves which are all loads offset from the LHS, including
21256// buildvectors of multiple loads. For example the RHS tree could be
21257// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
21258// Whilst it can be common for the larger loads to replace LDP instructions
21259 // (which doesn't gain anything on its own), the larger loads can help create
21260// more efficient code, and in buildvectors prevent the need for ld1 lane
21261// inserts which can be slower than normal loads.
21263 EVT VT = N->getValueType(0);
21264 if (!VT.isFixedLengthVector() ||
21265 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21266 VT.getScalarSizeInBits() != 64))
21267 return SDValue();
21268
21269 SDValue Other = N->getOperand(0);
21270 SDValue Shift = N->getOperand(1);
21271 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21272 std::swap(Shift, Other);
21273 APInt ShiftAmt;
21274 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21275 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
21276 return SDValue();
21277
21278 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
21279 !ISD::isExtOpcode(Other.getOpcode()) ||
21280 Shift.getOperand(0).getOperand(0).getValueType() !=
21281 Other.getOperand(0).getValueType() ||
21282 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
21283 return SDValue();
21284
21285 SDValue Op0 = Other.getOperand(0);
21286 SDValue Op1 = Shift.getOperand(0).getOperand(0);
21287
21288 unsigned NumSubLoads = 0;
21289 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21290 return SDValue();
21291
21292 // Attempt to rule out some unprofitable cases using heuristics (some working
21293 // around suboptimal code generation), notably if the extend would not be able
21294 // to use ushll2 instructions as the types are not large enough. Otherwise zips
21295 // will need to be created, which can increase the instruction count.
21296 unsigned NumElts = Op0.getValueType().getVectorNumElements();
21297 unsigned NumSubElts = NumElts / NumSubLoads;
21298 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21299 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
21300 Op0.getValueType().getSizeInBits() < 128 &&
21302 return SDValue();
21303
21304 // Recreate the tree with the new combined loads.
21305 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
21306 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
21307 EVT DVT =
21308 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
21309 
21310 SmallVector<LoadSDNode *> Loads0, Loads1;
21311 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21312 isLoadOrMultipleLoads(Op1, Loads1)) {
21313 EVT LoadVT = EVT::getVectorVT(
21314 *DAG.getContext(), Op0.getValueType().getScalarType(),
21315 Op0.getValueType().getVectorNumElements() / Loads0.size());
21316 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21317
21318 SmallVector<SDValue> NewLoads;
21319 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
21320 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
21321 L0->getBasePtr(), L0->getPointerInfo(),
21322 L0->getOriginalAlign());
21323 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
21324 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
21325 NewLoads.push_back(Load);
21326 }
21327 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
21328 }
21329
21330 SmallVector<SDValue> Ops;
21331 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
21332 Ops.push_back(GenCombinedTree(O0, O1, DAG));
21333 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
21334 };
21335 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
21336
21337 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
21338 int Hi = NumSubElts, Lo = 0;
21339 for (unsigned i = 0; i < NumSubLoads; i++) {
21340 for (unsigned j = 0; j < NumSubElts; j++) {
21341 LowMask[i * NumSubElts + j] = Lo++;
21342 HighMask[i * NumSubElts + j] = Hi++;
21343 }
21344 Lo += NumSubElts;
21345 Hi += NumSubElts;
21346 }
21347 SDLoc DL(N);
21348 SDValue Ext0, Ext1;
21349 // Extract the top and bottom lanes, then extend the result. Alternatively,
21350 // extend the result and then extract the lanes if the two operands match, as
21351 // this produces slightly smaller code.
21352 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
21353 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
21354 NewOp, DAG.getConstant(0, DL, MVT::i64));
21355 SDValue SubH =
21356 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
21357 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21358 SDValue Extr0 =
21359 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
21360 SDValue Extr1 =
21361 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
21362 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
21363 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
21364 } else {
21365 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
21366 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
21367 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21368 DAG.getConstant(0, DL, MVT::i64));
21369 SDValue SubH =
21370 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21371 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21372 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
21373 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
21374 }
21375 SDValue NShift =
21376 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
21377 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
21378}
21379
21380 static SDValue performAddSubCombine(SDNode *N,
21381 TargetLowering::DAGCombinerInfo &DCI) {
21382 // Try to change sum of two reductions.
21383 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
21384 return Val;
21385 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
21386 return Val;
21387 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
21388 return Val;
21389 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
21390 return Val;
21391 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
21392 return Val;
21394 return Val;
21395 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
21396 return Val;
21397 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
21398 return Val;
21399 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
21400 return Val;
21401
21402 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
21403 return Val;
21404
21405 return performAddSubLongCombine(N, DCI);
21406}
21407
21408// Massage DAGs which we can use the high-half "long" operations on into
21409// something isel will recognize better. E.g.
21410//
21411// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
21412// (aarch64_neon_umull (extract_high (v2i64 vec)))
21413// (extract_high (v2i64 (dup128 scalar)))))
21414//
21415 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
21416 TargetLowering::DAGCombinerInfo &DCI,
21417 SelectionDAG &DAG) {
21418 if (DCI.isBeforeLegalizeOps())
21419 return SDValue();
21420
21421 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
21422 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
21423 assert(LHS.getValueType().is64BitVector() &&
21424 RHS.getValueType().is64BitVector() &&
21425 "unexpected shape for long operation");
21426
21427 // Either node could be a DUP, but it's not worth doing both of them (you'd
21428 // just as well use the non-high version) so look for a corresponding extract
21429 // operation on the other "wing".
21430 if (isEssentiallyExtractHighSubvector(LHS)) {
21431 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
21432 if (!RHS.getNode())
21433 return SDValue();
21434 } else if (isEssentiallyExtractHighSubvector(RHS)) {
21435 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
21436 if (!LHS.getNode())
21437 return SDValue();
21438 } else
21439 return SDValue();
21440
21441 if (IID == Intrinsic::not_intrinsic)
21442 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
21443
21444 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
21445 N->getOperand(0), LHS, RHS);
21446}
21447
21448static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21449 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
21450 unsigned ElemBits = ElemTy.getSizeInBits();
21451
21452 int64_t ShiftAmount;
21453 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
21454 APInt SplatValue, SplatUndef;
21455 unsigned SplatBitSize;
21456 bool HasAnyUndefs;
21457 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
21458 HasAnyUndefs, ElemBits) ||
21459 SplatBitSize != ElemBits)
21460 return SDValue();
21461
21462 ShiftAmount = SplatValue.getSExtValue();
21463 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
21464 ShiftAmount = CVN->getSExtValue();
21465 } else
21466 return SDValue();
21467
21468 // If the shift amount is zero, remove the shift intrinsic.
21469 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
21470 return N->getOperand(1);
21471
21472 unsigned Opcode;
21473 bool IsRightShift;
21474 switch (IID) {
21475 default:
21476 llvm_unreachable("Unknown shift intrinsic");
21477 case Intrinsic::aarch64_neon_sqshl:
21478 Opcode = AArch64ISD::SQSHL_I;
21479 IsRightShift = false;
21480 break;
21481 case Intrinsic::aarch64_neon_uqshl:
21482 Opcode = AArch64ISD::UQSHL_I;
21483 IsRightShift = false;
21484 break;
21485 case Intrinsic::aarch64_neon_srshl:
21486 Opcode = AArch64ISD::SRSHR_I;
21487 IsRightShift = true;
21488 break;
21489 case Intrinsic::aarch64_neon_urshl:
21490 Opcode = AArch64ISD::URSHR_I;
21491 IsRightShift = true;
21492 break;
21493 case Intrinsic::aarch64_neon_sqshlu:
21494 Opcode = AArch64ISD::SQSHLU_I;
21495 IsRightShift = false;
21496 break;
21497 case Intrinsic::aarch64_neon_sshl:
21498 case Intrinsic::aarch64_neon_ushl:
21499 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
21500 // left shift for positive shift amounts. For negative shifts we can use a
21501 // VASHR/VLSHR as appropriate.
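// For example (an illustrative case): ushl(x, splat(-3)) is emitted as
// VLSHR(x, 3), while ushl(x, splat(3)) is emitted as VSHL(x, 3).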
21502 if (ShiftAmount < 0) {
21503 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
21504 : AArch64ISD::VLSHR;
21505 ShiftAmount = -ShiftAmount;
21506 } else
21507 Opcode = AArch64ISD::VSHL;
21508 IsRightShift = false;
21509 break;
21510 }
21511
21512 EVT VT = N->getValueType(0);
21513 SDValue Op = N->getOperand(1);
21514 SDLoc dl(N);
21515 if (VT == MVT::i64) {
21516 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
21517 VT = MVT::v1i64;
21518 }
21519
21520 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
21521 Op = DAG.getNode(Opcode, dl, VT, Op,
21522 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
21523 if (N->getValueType(0) == MVT::i64)
21524 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
21525 DAG.getConstant(0, dl, MVT::i64));
21526 return Op;
21527 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
21528 Op = DAG.getNode(Opcode, dl, VT, Op,
21529 DAG.getConstant(ShiftAmount, dl, MVT::i32));
21530 if (N->getValueType(0) == MVT::i64)
21531 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
21532 DAG.getConstant(0, dl, MVT::i64));
21533 return Op;
21534 }
21535
21536 return SDValue();
21537}
21538
21539// The CRC32[BH] instructions ignore the high bits of their data operand. Since
21540// the intrinsics must be legal and take an i32, this means there's almost
21541// certainly going to be a zext in the DAG which we can eliminate.
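// For example (illustrative): crc32b(crc, and(x, 0xff)) can be rewritten as
// crc32b(crc, x), because CRC32B only reads the low 8 bits of its data operand.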
21542static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
21543 SDValue AndN = N->getOperand(2);
21544 if (AndN.getOpcode() != ISD::AND)
21545 return SDValue();
21546
21547 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
21548 if (!CMask || CMask->getZExtValue() != Mask)
21549 return SDValue();
21550
21551 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
21552 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
21553}
21554
21555 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
21556 SelectionDAG &DAG) {
21557 SDLoc dl(N);
21558 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
21559 DAG.getNode(Opc, dl,
21560 N->getOperand(1).getSimpleValueType(),
21561 N->getOperand(1)),
21562 DAG.getConstant(0, dl, MVT::i64));
21563}
21564
21565 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
21566 SDLoc DL(N);
21567 SDValue Op1 = N->getOperand(1);
21568 SDValue Op2 = N->getOperand(2);
21569 EVT ScalarTy = Op2.getValueType();
21570 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21571 ScalarTy = MVT::i32;
21572
21573 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
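// For example (illustrative): index(2, 3) on nxv4i32 yields <2, 5, 8, 11, ...>.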
21574 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
21575 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
21576 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
21577 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
21578 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
21579}
21580
21581 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
21582 SDLoc dl(N);
21583 SDValue Scalar = N->getOperand(3);
21584 EVT ScalarTy = Scalar.getValueType();
21585
21586 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21587 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
21588
21589 SDValue Passthru = N->getOperand(1);
21590 SDValue Pred = N->getOperand(2);
21591 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
21592 Pred, Scalar, Passthru);
21593}
21594
21595 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
21596 SDLoc dl(N);
21597 LLVMContext &Ctx = *DAG.getContext();
21598 EVT VT = N->getValueType(0);
21599
21600 assert(VT.isScalableVector() && "Expected a scalable vector.");
21601
21602 // Current lowering only supports the SVE-ACLE types.
21603 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
21604 return SDValue();
21605
21606 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
21607 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
21608 EVT ByteVT =
21609 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
21610
21611 // Convert everything to the domain of EXT (i.e. bytes).
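// For example (illustrative): with nxv4i32 operands an element index of 3
// becomes a byte offset of 12 (3 * 4 bytes) for the EXT instruction.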
21612 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
21613 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
21614 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
21615 DAG.getConstant(ElemSize, dl, MVT::i32));
21616
21617 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
21618 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
21619}
21620
21621 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
21622 TargetLowering::DAGCombinerInfo &DCI,
21623 SelectionDAG &DAG) {
21624 if (DCI.isBeforeLegalize())
21625 return SDValue();
21626
21627 SDValue Comparator = N->getOperand(3);
21628 if (Comparator.getOpcode() == AArch64ISD::DUP ||
21629 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
21630 unsigned IID = getIntrinsicID(N);
21631 EVT VT = N->getValueType(0);
21632 EVT CmpVT = N->getOperand(2).getValueType();
21633 SDValue Pred = N->getOperand(1);
21634 SDValue Imm;
21635 SDLoc DL(N);
21636
21637 switch (IID) {
21638 default:
21639 llvm_unreachable("Called with wrong intrinsic!");
21640 break;
21641
21642 // Signed comparisons
21643 case Intrinsic::aarch64_sve_cmpeq_wide:
21644 case Intrinsic::aarch64_sve_cmpne_wide:
21645 case Intrinsic::aarch64_sve_cmpge_wide:
21646 case Intrinsic::aarch64_sve_cmpgt_wide:
21647 case Intrinsic::aarch64_sve_cmplt_wide:
21648 case Intrinsic::aarch64_sve_cmple_wide: {
21649 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21650 int64_t ImmVal = CN->getSExtValue();
21651 if (ImmVal >= -16 && ImmVal <= 15)
21652 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
21653 else
21654 return SDValue();
21655 }
21656 break;
21657 }
21658 // Unsigned comparisons
21659 case Intrinsic::aarch64_sve_cmphs_wide:
21660 case Intrinsic::aarch64_sve_cmphi_wide:
21661 case Intrinsic::aarch64_sve_cmplo_wide:
21662 case Intrinsic::aarch64_sve_cmpls_wide: {
21663 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21664 uint64_t ImmVal = CN->getZExtValue();
21665 if (ImmVal <= 127)
21666 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
21667 else
21668 return SDValue();
21669 }
21670 break;
21671 }
21672 }
21673
21674 if (!Imm)
21675 return SDValue();
21676
21677 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
21678 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
21679 N->getOperand(2), Splat, DAG.getCondCode(CC));
21680 }
21681
21682 return SDValue();
21683}
21684
21685 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21686 AArch64CC::CondCode Cond) {
21687 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21688
21689 SDLoc DL(Op);
21690 assert(Op.getValueType().isScalableVector() &&
21691 TLI.isTypeLegal(Op.getValueType()) &&
21692 "Expected legal scalable vector type!");
21693 assert(Op.getValueType() == Pg.getValueType() &&
21694 "Expected same type for PTEST operands");
21695
21696 // Ensure target specific opcodes are using legal type.
21697 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
21698 SDValue TVal = DAG.getConstant(1, DL, OutVT);
21699 SDValue FVal = DAG.getConstant(0, DL, OutVT);
21700
21701 // Ensure operands have type nxv16i1.
21702 if (Op.getValueType() != MVT::nxv16i1) {
21705 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
21706 else
21707 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
21708 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
21709 }
21710
21711 // Set condition code (CC) flags.
21712 SDValue Test = DAG.getNode(
21713 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
21714 DL, MVT::i32, Pg, Op);
21715
21716 // Convert CC to integer based on requested condition.
21717 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
21718 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
21719 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
21720 return DAG.getZExtOrTrunc(Res, DL, VT);
21721}
21722
21723 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
21724 SelectionDAG &DAG) {
21725 SDLoc DL(N);
21726
21727 SDValue Pred = N->getOperand(1);
21728 SDValue VecToReduce = N->getOperand(2);
21729
21730 // NOTE: The integer reduction's result type is not always linked to the
21731 // operand's element type so we construct it from the intrinsic's result type.
21732 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
21733 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21734
21735 // SVE reductions set the whole vector register with the first element
21736 // containing the reduction result, which we'll now extract.
21737 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21738 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21739 Zero);
21740}
21741
21742 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
21743 SelectionDAG &DAG) {
21744 SDLoc DL(N);
21745
21746 SDValue Pred = N->getOperand(1);
21747 SDValue VecToReduce = N->getOperand(2);
21748
21749 EVT ReduceVT = VecToReduce.getValueType();
21750 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21751
21752 // SVE reductions set the whole vector register with the first element
21753 // containing the reduction result, which we'll now extract.
21754 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21755 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21756 Zero);
21757}
21758
21759 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
21760 SelectionDAG &DAG) {
21761 SDLoc DL(N);
21762
21763 SDValue Pred = N->getOperand(1);
21764 SDValue InitVal = N->getOperand(2);
21765 SDValue VecToReduce = N->getOperand(3);
21766 EVT ReduceVT = VecToReduce.getValueType();
21767
21768 // Ordered reductions use the first lane of the result vector as the
21769 // reduction's initial value.
21770 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21771 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
21772 DAG.getUNDEF(ReduceVT), InitVal, Zero);
21773
21774 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
21775
21776 // SVE reductions set the whole vector register with the first element
21777 // containing the reduction result, which we'll now extract.
21778 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21779 Zero);
21780}
21781
21782// If a merged operation has no inactive lanes we can relax it to a predicated
21783// or unpredicated operation, which potentially allows better isel (perhaps
21784// using immediate forms) or relaxing register reuse requirements.
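// For example (illustrative): aarch64.sve.sqadd(pg, x, y) with an all-active
// predicate can be emitted as ISD::SADDSAT(x, y), and aarch64.sve.subr likewise
// becomes an ISD::SUB with its operands swapped.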
21785 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
21786 SelectionDAG &DAG, bool UnpredOp = false,
21787 bool SwapOperands = false) {
21788 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
21789 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
21790 SDValue Pg = N->getOperand(1);
21791 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
21792 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
21793
21794 // ISD way to specify an all active predicate.
21795 if (isAllActivePredicate(DAG, Pg)) {
21796 if (UnpredOp)
21797 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
21798
21799 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
21800 }
21801
21802 // FUTURE: SplatVector(true)
21803 return SDValue();
21804}
21805
21806 static SDValue tryCombineWhileLo(SDNode *N,
21807 TargetLowering::DAGCombinerInfo &DCI,
21808 const AArch64Subtarget *Subtarget) {
21809 if (DCI.isBeforeLegalize())
21810 return SDValue();
21811
21812 if (!Subtarget->hasSVE2p1())
21813 return SDValue();
21814
21815 if (!N->hasNUsesOfValue(2, 0))
21816 return SDValue();
21817
21818 const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
21819 if (HalfSize < 2)
21820 return SDValue();
21821
21822 auto It = N->user_begin();
21823 SDNode *Lo = *It++;
21824 SDNode *Hi = *It;
21825
21826 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
21827 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
21828 return SDValue();
21829
21830 uint64_t OffLo = Lo->getConstantOperandVal(1);
21831 uint64_t OffHi = Hi->getConstantOperandVal(1);
21832
21833 if (OffLo > OffHi) {
21834 std::swap(Lo, Hi);
21835 std::swap(OffLo, OffHi);
21836 }
21837
21838 if (OffLo != 0 || OffHi != HalfSize)
21839 return SDValue();
21840
21841 EVT HalfVec = Lo->getValueType(0);
21842 if (HalfVec != Hi->getValueType(0) ||
21843 HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
21844 return SDValue();
21845
21846 SelectionDAG &DAG = DCI.DAG;
21847 SDLoc DL(N);
21848 SDValue ID =
21849 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
21850 SDValue Idx = N->getOperand(1);
21851 SDValue TC = N->getOperand(2);
21852 if (Idx.getValueType() != MVT::i64) {
21853 Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
21854 TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
21855 }
21856 auto R =
21857 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
21858 {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
21859
21860 DCI.CombineTo(Lo, R.getValue(0));
21861 DCI.CombineTo(Hi, R.getValue(1));
21862
21863 return SDValue(N, 0);
21864}
21865
21866 static SDValue tryLowerPartialReductionToDot(SDNode *N,
21867 const AArch64Subtarget *Subtarget,
21868 SelectionDAG &DAG) {
21869
21870 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21871 getIntrinsicID(N) ==
21872 Intrinsic::experimental_vector_partial_reduce_add &&
21873 "Expected a partial reduction node");
21874
21875 bool Scalable = N->getValueType(0).isScalableVector();
21876 if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
21877 return SDValue();
21878 if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
21879 return SDValue();
21880
21881 SDLoc DL(N);
21882
21883 SDValue Op2 = N->getOperand(2);
21884 unsigned Op2Opcode = Op2->getOpcode();
21885 SDValue MulOpLHS, MulOpRHS;
21886 bool MulOpLHSIsSigned, MulOpRHSIsSigned;
21887 if (ISD::isExtOpcode(Op2Opcode)) {
21888 MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
21889 MulOpLHS = Op2->getOperand(0);
21890 MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType());
21891 } else if (Op2Opcode == ISD::MUL) {
21892 SDValue ExtMulOpLHS = Op2->getOperand(0);
21893 SDValue ExtMulOpRHS = Op2->getOperand(1);
21894
21895 unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
21896 unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
21897 if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
21898 !ISD::isExtOpcode(ExtMulOpRHSOpcode))
21899 return SDValue();
21900
21901 MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
21902 MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
21903
21904 MulOpLHS = ExtMulOpLHS->getOperand(0);
21905 MulOpRHS = ExtMulOpRHS->getOperand(0);
21906
21907 if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
21908 return SDValue();
21909 } else
21910 return SDValue();
21911
21912 SDValue Acc = N->getOperand(1);
21913 EVT ReducedVT = N->getValueType(0);
21914 EVT MulSrcVT = MulOpLHS.getValueType();
21915
21916 // Dot products operate on chunks of four elements so there must be four times
21917 // as many elements in the wide type
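// For example (illustrative): reducing nxv16i8 inputs into an nxv4i32
// accumulator maps onto a single sdot/udot, since each i32 lane accumulates
// four i8 products.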
21918 if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
21919 !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
21920 !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
21921 !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
21922 !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
21923 !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
21924 return SDValue();
21925
21926 // If the extensions are mixed, we should lower it to a usdot instead
21927 unsigned Opcode = 0;
21928 if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
21929 if (!Subtarget->hasMatMulInt8())
21930 return SDValue();
21931
21932 bool Scalable = N->getValueType(0).isScalableVT();
21933 // There's no nxv2i64 version of usdot
21934 if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
21935 return SDValue();
21936
21937 Opcode = AArch64ISD::USDOT;
21938 // USDOT expects the signed operand to be last
21939 if (!MulOpRHSIsSigned)
21940 std::swap(MulOpLHS, MulOpRHS);
21941 } else
21942 Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
21943
21944 // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
21945 // product followed by a zero / sign extension
21946 if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
21947 (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
21948 EVT ReducedVTI32 =
21949 (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
21950
21951 SDValue DotI32 =
21952 DAG.getNode(Opcode, DL, ReducedVTI32,
21953 DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS);
21954 SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT);
21955 return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended);
21956 }
21957
21958 return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
21959}
21960
21961 static SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
21962 const AArch64Subtarget *Subtarget,
21963 SelectionDAG &DAG) {
21964
21965 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21966 getIntrinsicID(N) ==
21967 Intrinsic::experimental_vector_partial_reduce_add &&
21968 "Expected a partial reduction node");
21969
21970 if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
21971 return SDValue();
21972
21973 SDLoc DL(N);
21974
21975 if (!ISD::isExtOpcode(N->getOperand(2).getOpcode()))
21976 return SDValue();
21977 SDValue Acc = N->getOperand(1);
21978 SDValue Ext = N->getOperand(2);
21979 EVT AccVT = Acc.getValueType();
21980 EVT ExtVT = Ext.getValueType();
21981 if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
21982 return SDValue();
21983
21984 SDValue ExtOp = Ext->getOperand(0);
21985 EVT ExtOpVT = ExtOp.getValueType();
21986
21987 if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
21988 !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
21989 !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
21990 return SDValue();
21991
21992 bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
21993 unsigned BottomOpcode =
21994 ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
21995 unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
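// Lower the partial reduction as a widening add of the bottom halves followed
// by a widening add of the top halves, e.g. (illustrative) for a sign-extended
// nxv8i16 input and an nxv4i32 accumulator: saddwb(acc, x) then saddwt(..., x).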
21996 SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, ExtOp);
21997 return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp);
21998}
21999
22000 static SDValue performIntrinsicCombine(SDNode *N,
22001 TargetLowering::DAGCombinerInfo &DCI,
22002 const AArch64Subtarget *Subtarget) {
22003 SelectionDAG &DAG = DCI.DAG;
22004 unsigned IID = getIntrinsicID(N);
22005 switch (IID) {
22006 default:
22007 break;
22008 case Intrinsic::experimental_vector_partial_reduce_add: {
22009 if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
22010 return Dot;
22011 if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
22012 return WideAdd;
22013 return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0),
22014 N->getOperand(1), N->getOperand(2));
22015 }
22016 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22017 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22018 return tryCombineFixedPointConvert(N, DCI, DAG);
22019 case Intrinsic::aarch64_neon_saddv:
22021 case Intrinsic::aarch64_neon_uaddv:
22023 case Intrinsic::aarch64_neon_sminv:
22025 case Intrinsic::aarch64_neon_uminv:
22027 case Intrinsic::aarch64_neon_smaxv:
22029 case Intrinsic::aarch64_neon_umaxv:
22031 case Intrinsic::aarch64_neon_fmax:
22032 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
22033 N->getOperand(1), N->getOperand(2));
22034 case Intrinsic::aarch64_neon_fmin:
22035 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22036 N->getOperand(1), N->getOperand(2));
22037 case Intrinsic::aarch64_neon_fmaxnm:
22038 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22039 N->getOperand(1), N->getOperand(2));
22040 case Intrinsic::aarch64_neon_fminnm:
22041 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22042 N->getOperand(1), N->getOperand(2));
22043 case Intrinsic::aarch64_neon_smull:
22044 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22045 N->getOperand(1), N->getOperand(2));
22046 case Intrinsic::aarch64_neon_umull:
22047 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22048 N->getOperand(1), N->getOperand(2));
22049 case Intrinsic::aarch64_neon_pmull:
22050 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22051 N->getOperand(1), N->getOperand(2));
22052 case Intrinsic::aarch64_neon_sqdmull:
22053 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22054 case Intrinsic::aarch64_neon_sqshl:
22055 case Intrinsic::aarch64_neon_uqshl:
22056 case Intrinsic::aarch64_neon_sqshlu:
22057 case Intrinsic::aarch64_neon_srshl:
22058 case Intrinsic::aarch64_neon_urshl:
22059 case Intrinsic::aarch64_neon_sshl:
22060 case Intrinsic::aarch64_neon_ushl:
22061 return tryCombineShiftImm(IID, N, DAG);
22062 case Intrinsic::aarch64_neon_sabd:
22063 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22064 N->getOperand(1), N->getOperand(2));
22065 case Intrinsic::aarch64_neon_uabd:
22066 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22067 N->getOperand(1), N->getOperand(2));
22068 case Intrinsic::aarch64_crc32b:
22069 case Intrinsic::aarch64_crc32cb:
22070 return tryCombineCRC32(0xff, N, DAG);
22071 case Intrinsic::aarch64_crc32h:
22072 case Intrinsic::aarch64_crc32ch:
22073 return tryCombineCRC32(0xffff, N, DAG);
22074 case Intrinsic::aarch64_sve_saddv:
22075 // There is no i64 version of SADDV because the sign is irrelevant.
22076 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
22078 else
22080 case Intrinsic::aarch64_sve_uaddv:
22082 case Intrinsic::aarch64_sve_smaxv:
22084 case Intrinsic::aarch64_sve_umaxv:
22086 case Intrinsic::aarch64_sve_sminv:
22088 case Intrinsic::aarch64_sve_uminv:
22090 case Intrinsic::aarch64_sve_orv:
22092 case Intrinsic::aarch64_sve_eorv:
22094 case Intrinsic::aarch64_sve_andv:
22096 case Intrinsic::aarch64_sve_index:
22097 return LowerSVEIntrinsicIndex(N, DAG);
22098 case Intrinsic::aarch64_sve_dup:
22099 return LowerSVEIntrinsicDUP(N, DAG);
22100 case Intrinsic::aarch64_sve_dup_x:
22101 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22102 N->getOperand(1));
22103 case Intrinsic::aarch64_sve_ext:
22104 return LowerSVEIntrinsicEXT(N, DAG);
22105 case Intrinsic::aarch64_sve_mul_u:
22106 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22107 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22108 case Intrinsic::aarch64_sve_smulh_u:
22109 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22110 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22111 case Intrinsic::aarch64_sve_umulh_u:
22112 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22113 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22114 case Intrinsic::aarch64_sve_smin_u:
22115 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22116 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22117 case Intrinsic::aarch64_sve_umin_u:
22118 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22119 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22120 case Intrinsic::aarch64_sve_smax_u:
22121 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22122 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22123 case Intrinsic::aarch64_sve_umax_u:
22124 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22125 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22126 case Intrinsic::aarch64_sve_lsl_u:
22127 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22128 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22129 case Intrinsic::aarch64_sve_lsr_u:
22130 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22131 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22132 case Intrinsic::aarch64_sve_asr_u:
22133 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22134 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22135 case Intrinsic::aarch64_sve_fadd_u:
22136 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22137 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22138 case Intrinsic::aarch64_sve_fdiv_u:
22139 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22140 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22141 case Intrinsic::aarch64_sve_fmax_u:
22142 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22143 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22144 case Intrinsic::aarch64_sve_fmaxnm_u:
22145 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22146 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22147 case Intrinsic::aarch64_sve_fmla_u:
22148 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22149 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22150 N->getOperand(2));
22151 case Intrinsic::aarch64_sve_fmin_u:
22152 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22153 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22154 case Intrinsic::aarch64_sve_fminnm_u:
22155 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22156 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22157 case Intrinsic::aarch64_sve_fmul_u:
22158 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22159 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22160 case Intrinsic::aarch64_sve_fsub_u:
22161 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22162 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22163 case Intrinsic::aarch64_sve_add_u:
22164 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22165 N->getOperand(3));
22166 case Intrinsic::aarch64_sve_sub_u:
22167 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22168 N->getOperand(3));
22169 case Intrinsic::aarch64_sve_subr:
22170 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22171 case Intrinsic::aarch64_sve_and_u:
22172 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22173 N->getOperand(3));
22174 case Intrinsic::aarch64_sve_bic_u:
22175 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22176 N->getOperand(2), N->getOperand(3));
22177 case Intrinsic::aarch64_sve_saddwb:
22178 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22179 N->getOperand(1), N->getOperand(2));
22180 case Intrinsic::aarch64_sve_saddwt:
22181 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22182 N->getOperand(1), N->getOperand(2));
22183 case Intrinsic::aarch64_sve_uaddwb:
22184 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22185 N->getOperand(1), N->getOperand(2));
22186 case Intrinsic::aarch64_sve_uaddwt:
22187 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22188 N->getOperand(1), N->getOperand(2));
22189 case Intrinsic::aarch64_sve_eor_u:
22190 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22191 N->getOperand(3));
22192 case Intrinsic::aarch64_sve_orr_u:
22193 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22194 N->getOperand(3));
22195 case Intrinsic::aarch64_sve_sabd_u:
22196 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22197 N->getOperand(2), N->getOperand(3));
22198 case Intrinsic::aarch64_sve_uabd_u:
22199 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22200 N->getOperand(2), N->getOperand(3));
22201 case Intrinsic::aarch64_sve_sdiv_u:
22202 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22203 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22204 case Intrinsic::aarch64_sve_udiv_u:
22205 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22206 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22207 case Intrinsic::aarch64_sve_sqadd:
22208 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22209 case Intrinsic::aarch64_sve_sqsub_u:
22210 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22211 N->getOperand(2), N->getOperand(3));
22212 case Intrinsic::aarch64_sve_uqadd:
22213 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22214 case Intrinsic::aarch64_sve_uqsub_u:
22215 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22216 N->getOperand(2), N->getOperand(3));
22217 case Intrinsic::aarch64_sve_sqadd_x:
22218 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22219 N->getOperand(1), N->getOperand(2));
22220 case Intrinsic::aarch64_sve_sqsub_x:
22221 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22222 N->getOperand(1), N->getOperand(2));
22223 case Intrinsic::aarch64_sve_uqadd_x:
22224 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22225 N->getOperand(1), N->getOperand(2));
22226 case Intrinsic::aarch64_sve_uqsub_x:
22227 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22228 N->getOperand(1), N->getOperand(2));
22229 case Intrinsic::aarch64_sve_asrd:
22230 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22231 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22232 case Intrinsic::aarch64_sve_cmphs:
22233 if (!N->getOperand(2).getValueType().isFloatingPoint())
22234 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22235 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22236 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22237 break;
22238 case Intrinsic::aarch64_sve_cmphi:
22239 if (!N->getOperand(2).getValueType().isFloatingPoint())
22240 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22241 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22242 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22243 break;
22244 case Intrinsic::aarch64_sve_fcmpge:
22245 case Intrinsic::aarch64_sve_cmpge:
22246 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22247 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22248 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22249 break;
22250 case Intrinsic::aarch64_sve_fcmpgt:
22251 case Intrinsic::aarch64_sve_cmpgt:
22252 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22253 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22254 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22255 break;
22256 case Intrinsic::aarch64_sve_fcmpeq:
22257 case Intrinsic::aarch64_sve_cmpeq:
22258 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22259 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22260 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22261 break;
22262 case Intrinsic::aarch64_sve_fcmpne:
22263 case Intrinsic::aarch64_sve_cmpne:
22264 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22265 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22266 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22267 break;
22268 case Intrinsic::aarch64_sve_fcmpuo:
22269 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22270 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22271 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22272 break;
22273 case Intrinsic::aarch64_sve_fadda:
22275 case Intrinsic::aarch64_sve_faddv:
22277 case Intrinsic::aarch64_sve_fmaxnmv:
22279 case Intrinsic::aarch64_sve_fmaxv:
22281 case Intrinsic::aarch64_sve_fminnmv:
22283 case Intrinsic::aarch64_sve_fminv:
22285 case Intrinsic::aarch64_sve_sel:
22286 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22287 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22288 case Intrinsic::aarch64_sve_cmpeq_wide:
22289 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
22290 case Intrinsic::aarch64_sve_cmpne_wide:
22291 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
22292 case Intrinsic::aarch64_sve_cmpge_wide:
22293 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
22294 case Intrinsic::aarch64_sve_cmpgt_wide:
22295 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
22296 case Intrinsic::aarch64_sve_cmplt_wide:
22297 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
22298 case Intrinsic::aarch64_sve_cmple_wide:
22299 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
22300 case Intrinsic::aarch64_sve_cmphs_wide:
22301 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
22302 case Intrinsic::aarch64_sve_cmphi_wide:
22303 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
22304 case Intrinsic::aarch64_sve_cmplo_wide:
22305 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
22306 case Intrinsic::aarch64_sve_cmpls_wide:
22307 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
22308 case Intrinsic::aarch64_sve_ptest_any:
22309 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22310 AArch64CC::ANY_ACTIVE);
22311 case Intrinsic::aarch64_sve_ptest_first:
22312 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22313 AArch64CC::FIRST_ACTIVE);
22314 case Intrinsic::aarch64_sve_ptest_last:
22315 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22316 AArch64CC::LAST_ACTIVE);
22317 case Intrinsic::aarch64_sve_whilelo:
22318 return tryCombineWhileLo(N, DCI, Subtarget);
22319 }
22320 return SDValue();
22321}
22322
22323static bool isCheapToExtend(const SDValue &N) {
22324 unsigned OC = N->getOpcode();
22325 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22326 ISD::isConstantSplatVectorAllZeros(N.getNode());
22327}
22328
22329static SDValue
22330 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22331 SelectionDAG &DAG) {
22332 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22333 // we can move the sext into the arguments and have the same result. For
22334 // example, if A and B are both loads, we can make those extending loads and
22335 // avoid an extra instruction. This pattern appears often in VLS code
22336 // generation where the inputs to the setcc have a different size to the
22337 // instruction that wants to use the result of the setcc.
22338 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22339 N->getOperand(0)->getOpcode() == ISD::SETCC);
22340 const SDValue SetCC = N->getOperand(0);
22341
22342 const SDValue CCOp0 = SetCC.getOperand(0);
22343 const SDValue CCOp1 = SetCC.getOperand(1);
22344 if (!CCOp0->getValueType(0).isInteger() ||
22345 !CCOp1->getValueType(0).isInteger())
22346 return SDValue();
22347
22348 ISD::CondCode Code =
22349 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
22350
22351 ISD::NodeType ExtType =
22352 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22353
22354 if (isCheapToExtend(SetCC.getOperand(0)) &&
22355 isCheapToExtend(SetCC.getOperand(1))) {
22356 const SDValue Ext1 =
22357 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
22358 const SDValue Ext2 =
22359 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
22360
22361 return DAG.getSetCC(
22362 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
22363 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
22364 }
22365
22366 return SDValue();
22367}
22368
22369// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22370// This comes from interleaved vectorization. It is performed late to capture
22371// uitofp converts too.
22373 SelectionDAG &DAG) {
22374 EVT VT = N->getValueType(0);
22375 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22376 N->getOpcode() != ISD::ZERO_EXTEND ||
22377 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22378 return SDValue();
22379
22380 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
22381 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22382 return SDValue();
22383
22384 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
22385 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
22386 if (!Shuffle ||
22387 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22388 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22389 return SDValue();
22390
22391 unsigned Idx;
22392 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22393 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
22394 // An undef interleave shuffle can come up after other canonicalizations,
22395 // where the shuffle has been converted to
22396 // zext(extract(shuffle b, undef, [u,u,0,4]))
22397 bool IsUndefDeInterleave = false;
22398 if (!IsDeInterleave)
22399 IsUndefDeInterleave =
22400 Shuffle->getOperand(1).isUndef() &&
22401 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22402 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
22403 VT.getVectorNumElements() / 2),
22404 4, Idx);
22405 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22406 return SDValue();
22407 SDLoc DL(N);
22408 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22409 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
22410 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22411 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
22412 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22413 VT, BC1, BC2);
22414 if ((Idx & 1) == 1)
22415 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
22416 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
22417 return DAG.getNode(
22418 ISD::AND, DL, VT, UZP,
22419 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22420}
22421
22422 // This comes up similarly to the above when lowering deinterleaving shuffles
22423 // from zexts. We have legalized the operations in the general case to
22424 // zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22425 // the extract is to the low half and the uzp is uzp1. There would be an extra
22426 // shift if the uzp was uzp2 to grab the upper half. Due to the combine above
22427 // there could also be an existing and / shift that can be combined in, either
22428 // before or after the extract.
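// For example (illustrative): zext(v4i16 extract_subvector(uzp1(a, b), 0)) to
// v4i32 becomes and(nvcast(a), splat(0xffff)), with an additional vlshr when
// the uzp is uzp2.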
22429 static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22430 EVT VT = N->getValueType(0);
22431 if (N->getOpcode() != ISD::ZERO_EXTEND ||
22432 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22433 return SDValue();
22434
22435 SDValue Op = N->getOperand(0);
22436 unsigned ExtOffset = (unsigned)-1;
22437 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22438 ExtOffset = Op.getConstantOperandVal(1);
22439 Op = Op.getOperand(0);
22440 }
22441
22442 unsigned Shift = 0;
22443 APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
22444 Op.getValueType().getScalarSizeInBits());
22445
22446 if (Op.getOpcode() == AArch64ISD::VLSHR) {
22447 Shift = Op.getConstantOperandVal(1);
22448 Op = Op.getOperand(0);
22449 Mask = Mask.lshr(Shift);
22450 }
22451 if (Op.getOpcode() == ISD::AND &&
22452 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
22453 Op = Op.getOperand(0);
22454 Mask = Mask.zext(VT.getScalarSizeInBits());
22455 } else if (Op.getOpcode() == AArch64ISD::BICi) {
22456 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
22457 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
22458 Mask = Mask.zext(VT.getScalarSizeInBits());
22459 Op = Op.getOperand(0);
22460 }
22461
22462 if (ExtOffset == (unsigned)-1) {
22463 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22464 ExtOffset = Op.getConstantOperandVal(1);
22465 Op = Op.getOperand(0);
22466 } else
22467 return SDValue();
22468 }
22469 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22470 return SDValue();
22471
22472 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
22473 return SDValue();
22474 if (Op.getOpcode() == AArch64ISD::UZP2)
22475 Shift += VT.getScalarSizeInBits() / 2;
22476
22477 SDLoc DL(N);
22478 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22479 Op.getOperand(ExtOffset == 0 ? 0 : 1));
22480 if (Shift != 0)
22481 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
22482 DAG.getConstant(Shift, DL, MVT::i32));
22483 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
22484}
22485
22486 static SDValue performExtendCombine(SDNode *N,
22487 TargetLowering::DAGCombinerInfo &DCI,
22488 SelectionDAG &DAG) {
22489 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
22490 // we can convert that DUP into another extract_high (of a bigger DUP), which
22491 // helps the backend to decide that an sabdl2 would be useful, saving a real
22492 // extract_high operation.
22493 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
22494 N->getOperand(0).getValueType().is64BitVector() &&
22495 (N->getOperand(0).getOpcode() == ISD::ABDU ||
22496 N->getOperand(0).getOpcode() == ISD::ABDS)) {
22497 SDNode *ABDNode = N->getOperand(0).getNode();
22498 SDValue NewABD =
22499 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
22500 if (!NewABD.getNode())
22501 return SDValue();
22502
22503 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
22504 }
22505
22507 return R;
22508 if (SDValue R = performZExtUZPCombine(N, DAG))
22509 return R;
22510
22511 if (N->getValueType(0).isFixedLengthVector() &&
22512 N->getOpcode() == ISD::SIGN_EXTEND &&
22513 N->getOperand(0)->getOpcode() == ISD::SETCC)
22514 return performSignExtendSetCCCombine(N, DCI, DAG);
22515
22516 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
22517 // that the top half of the result register must be unused, due to the
22518 // any_extend. This means that we can replace this pattern with (rev16
22519 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
22520 // ...)), which is what this pattern would otherwise be lowered to.
22521 // Only apply this optimisation if any_extend in original pattern to i32 or
22522 // i64, because this type will become the input type to REV16 in the new
22523 // pattern, so must be a legitimate REV16 input type.
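// For example (illustrative): (i32 (any_extend (i16 (bswap x)))) becomes
// (i32 (rev16 (i32 (any_extend x)))); the extra high bits are dead because of
// the any_extend.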
22524 SDValue Bswap = N->getOperand(0);
22525 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
22526 Bswap.getValueType() == MVT::i16 &&
22527 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
22528 SDLoc DL(N);
22529 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
22530 Bswap->getOperand(0));
22531 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
22532 NewAnyExtend);
22533 }
22534
22535 return SDValue();
22536}
22537
22538 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
22539 SDValue SplatVal, unsigned NumVecElts) {
22540 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
22541 Align OrigAlignment = St.getAlign();
22542 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
22543
22544 // Create scalar stores. This is at least as good as the code sequence for a
22545 // split unaligned store which is a dup.s, ext.b, and two stores.
22546 // Most of the time the three stores should be replaced by store pair
22547 // instructions (stp).
22548 SDLoc DL(&St);
22549 SDValue BasePtr = St.getBasePtr();
22550 uint64_t BaseOffset = 0;
22551
22552 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
22553 SDValue NewST1 =
22554 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
22555 OrigAlignment, St.getMemOperand()->getFlags());
22556
22557 // As this is in ISel, we will not merge this add, which may degrade results.
22558 if (BasePtr->getOpcode() == ISD::ADD &&
22559 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
22560 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
22561 BasePtr = BasePtr->getOperand(0);
22562 }
22563
22564 unsigned Offset = EltOffset;
22565 while (--NumVecElts) {
22566 Align Alignment = commonAlignment(OrigAlignment, Offset);
22567 SDValue OffsetPtr =
22568 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
22569 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
22570 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
22571 PtrInfo.getWithOffset(Offset), Alignment,
22572 St.getMemOperand()->getFlags());
22573 Offset += EltOffset;
22574 }
22575 return NewST1;
22576}
22577
22578// Returns an SVE type that ContentTy can be trivially sign or zero extended
22579// into.
22580static MVT getSVEContainerType(EVT ContentTy) {
22581 assert(ContentTy.isSimple() && "No SVE containers for extended types");
22582
22583 switch (ContentTy.getSimpleVT().SimpleTy) {
22584 default:
22585 llvm_unreachable("No known SVE container for this MVT type");
22586 case MVT::nxv2i8:
22587 case MVT::nxv2i16:
22588 case MVT::nxv2i32:
22589 case MVT::nxv2i64:
22590 case MVT::nxv2f32:
22591 case MVT::nxv2f64:
22592 return MVT::nxv2i64;
22593 case MVT::nxv4i8:
22594 case MVT::nxv4i16:
22595 case MVT::nxv4i32:
22596 case MVT::nxv4f32:
22597 return MVT::nxv4i32;
22598 case MVT::nxv8i8:
22599 case MVT::nxv8i16:
22600 case MVT::nxv8f16:
22601 case MVT::nxv8bf16:
22602 return MVT::nxv8i16;
22603 case MVT::nxv16i8:
22604 return MVT::nxv16i8;
22605 }
22606}
22607
22608static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
22609 SDLoc DL(N);
22610 EVT VT = N->getValueType(0);
22611
22612 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
22613 return SDValue();
22614
22615 EVT ContainerVT = VT;
22616 if (ContainerVT.isInteger())
22617 ContainerVT = getSVEContainerType(ContainerVT);
22618
22619 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
22620 SDValue Ops[] = { N->getOperand(0), // Chain
22621 N->getOperand(2), // Pg
22622 N->getOperand(3), // Base
22623 DAG.getValueType(VT) };
22624
22625 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
22626 SDValue LoadChain = SDValue(Load.getNode(), 1);
22627
22628 if (ContainerVT.isInteger() && (VT != ContainerVT))
22629 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
22630
22631 return DAG.getMergeValues({ Load, LoadChain }, DL);
22632}
22633
22634 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
22635 SDLoc DL(N);
22636 EVT VT = N->getValueType(0);
22637 EVT PtrTy = N->getOperand(3).getValueType();
22638
22639 EVT LoadVT = VT;
22640 if (VT.isFloatingPoint())
22641 LoadVT = VT.changeTypeToInteger();
22642
22643 auto *MINode = cast<MemIntrinsicSDNode>(N);
22644 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
22645 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
22646 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
22647 MINode->getOperand(2), PassThru,
22648 MINode->getMemoryVT(), MINode->getMemOperand(),
22649 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
22650
22651 if (VT.isFloatingPoint()) {
22652 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
22653 return DAG.getMergeValues(Ops, DL);
22654 }
22655
22656 return L;
22657}
22658
22659template <unsigned Opcode>
22660 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
22661 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
22662 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
22663 "Unsupported opcode.");
22664 SDLoc DL(N);
22665 EVT VT = N->getValueType(0);
22666
22667 EVT LoadVT = VT;
22668 if (VT.isFloatingPoint())
22669 LoadVT = VT.changeTypeToInteger();
22670
22671 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
22672 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
22673 SDValue LoadChain = SDValue(Load.getNode(), 1);
22674
22675 if (VT.isFloatingPoint())
22676 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
22677
22678 return DAG.getMergeValues({Load, LoadChain}, DL);
22679}
22680
22681 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
22682 SDLoc DL(N);
22683 SDValue Data = N->getOperand(2);
22684 EVT DataVT = Data.getValueType();
22685 EVT HwSrcVt = getSVEContainerType(DataVT);
22686 SDValue InputVT = DAG.getValueType(DataVT);
22687
22688 if (DataVT.isFloatingPoint())
22689 InputVT = DAG.getValueType(HwSrcVt);
22690
22691 SDValue SrcNew;
22692 if (Data.getValueType().isFloatingPoint())
22693 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
22694 else
22695 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
22696
22697 SDValue Ops[] = { N->getOperand(0), // Chain
22698 SrcNew,
22699 N->getOperand(4), // Base
22700 N->getOperand(3), // Pg
22701 InputVT
22702 };
22703
22704 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
22705}
22706
22707 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
22708 SDLoc DL(N);
22709
22710 SDValue Data = N->getOperand(2);
22711 EVT DataVT = Data.getValueType();
22712 EVT PtrTy = N->getOperand(4).getValueType();
22713
22714 if (DataVT.isFloatingPoint())
22715 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
22716
22717 auto *MINode = cast<MemIntrinsicSDNode>(N);
22718 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
22719 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
22720 MINode->getMemoryVT(), MINode->getMemOperand(),
22721 ISD::UNINDEXED, false, false);
22722}
22723
22724/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
22725/// load store optimizer pass will merge them to store pair stores. This should
22726/// be better than a movi to create the vector zero followed by a vector store
22727 /// if the zero constant is not re-used, since one instruction and one register
22728/// live range will be removed.
22729///
22730/// For example, the final generated code should be:
22731///
22732/// stp xzr, xzr, [x0]
22733///
22734/// instead of:
22735///
22736/// movi v0.2d, #0
22737/// str q0, [x0]
22738///
22739 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22740 SDValue StVal = St.getValue();
22741 EVT VT = StVal.getValueType();
22742
22743 // Avoid scalarizing zero splat stores for scalable vectors.
22744 if (VT.isScalableVector())
22745 return SDValue();
22746
22747 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
22748 // 2, 3 or 4 i32 elements.
22749 int NumVecElts = VT.getVectorNumElements();
22750 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
22751 VT.getVectorElementType().getSizeInBits() == 64) ||
22752 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
22753 VT.getVectorElementType().getSizeInBits() == 32)))
22754 return SDValue();
22755
22756 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
22757 return SDValue();
22758
22759 // If the zero constant has more than one use then the vector store could be
22760 // better since the constant mov will be amortized and stp q instructions
22761 // should be able to be formed.
22762 if (!StVal.hasOneUse())
22763 return SDValue();
22764
22765 // If the store is truncating then it's going down to i16 or smaller, which
22766 // means it can be implemented in a single store anyway.
22767 if (St.isTruncatingStore())
22768 return SDValue();
22769
22770 // If the immediate offset of the address operand is too large for the stp
22771 // instruction, then bail out.
22772 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
22773 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
22774 if (Offset < -512 || Offset > 504)
22775 return SDValue();
22776 }
22777
22778 for (int I = 0; I < NumVecElts; ++I) {
22779 SDValue EltVal = StVal.getOperand(I);
22780 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
22781 return SDValue();
22782 }
22783
22784 // Use a CopyFromReg WZR/XZR here to prevent
22785 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
22786 SDLoc DL(&St);
22787 unsigned ZeroReg;
22788 EVT ZeroVT;
22789 if (VT.getVectorElementType().getSizeInBits() == 32) {
22790 ZeroReg = AArch64::WZR;
22791 ZeroVT = MVT::i32;
22792 } else {
22793 ZeroReg = AArch64::XZR;
22794 ZeroVT = MVT::i64;
22795 }
22796 SDValue SplatVal =
22797 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
22798 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22799}
22800
22801/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
22802/// value. The load store optimizer pass will merge them to store pair stores.
22803/// This has better performance than a splat of the scalar followed by a split
22804/// vector store. Even if the stores are not merged it is four stores vs a dup,
22805/// followed by an ext.b and two stores.
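/// For example (illustrative), splatting w1 into a v4i32 and storing it to
/// [x0] can instead be emitted as four scalar stores of w1, which the load
/// store optimizer will typically merge into two stp instructions.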
22806 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22807 SDValue StVal = St.getValue();
22808 EVT VT = StVal.getValueType();
22809
22810 // Don't replace floating point stores, they possibly won't be transformed to
22811 // stp because of the store pair suppress pass.
22812 if (VT.isFloatingPoint())
22813 return SDValue();
22814
22815 // We can express a splat as store pair(s) for 2 or 4 elements.
22816 unsigned NumVecElts = VT.getVectorNumElements();
22817 if (NumVecElts != 4 && NumVecElts != 2)
22818 return SDValue();
22819
22820 // If the store is truncating then it's going down to i16 or smaller, which
22821 // means it can be implemented in a single store anyway.
22822 if (St.isTruncatingStore())
22823 return SDValue();
22824
22825 // Check that this is a splat.
22826 // Make sure that each of the relevant vector element locations are inserted
22827 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
22828 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
22829 SDValue SplatVal;
22830 for (unsigned I = 0; I < NumVecElts; ++I) {
22831 // Check for insert vector elements.
22832 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
22833 return SDValue();
22834
22835 // Check that same value is inserted at each vector element.
22836 if (I == 0)
22837 SplatVal = StVal.getOperand(1);
22838 else if (StVal.getOperand(1) != SplatVal)
22839 return SDValue();
22840
22841 // Check insert element index.
22842 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
22843 if (!CIndex)
22844 return SDValue();
22845 uint64_t IndexVal = CIndex->getZExtValue();
22846 if (IndexVal >= NumVecElts)
22847 return SDValue();
22848 IndexNotInserted.reset(IndexVal);
22849
22850 StVal = StVal.getOperand(0);
22851 }
22852 // Check that all vector element locations were inserted to.
22853 if (IndexNotInserted.any())
22854 return SDValue();
22855
22856 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22857}
22858
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22860 SelectionDAG &DAG,
22861 const AArch64Subtarget *Subtarget) {
22862
22863 StoreSDNode *S = cast<StoreSDNode>(N);
22864 if (S->isVolatile() || S->isIndexed())
22865 return SDValue();
22866
22867 SDValue StVal = S->getValue();
22868 EVT VT = StVal.getValueType();
22869
22870 if (!VT.isFixedLengthVector())
22871 return SDValue();
22872
22873 // If we get a splat of zeros, convert this vector store to a store of
22874 // scalars. They will be merged into store pairs of xzr thereby removing one
22875 // instruction and one register.
22876 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
22877 return ReplacedZeroSplat;
22878
22879 // FIXME: The logic for deciding if an unaligned store should be split should
22880 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
22881 // a call to that function here.
22882
22883 if (!Subtarget->isMisaligned128StoreSlow())
22884 return SDValue();
22885
22886 // Don't split at -Oz.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
22888 return SDValue();
22889
22890 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
22891 // those up regresses performance on micro-benchmarks and olden/bh.
22892 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
22893 return SDValue();
22894
22895 // Split unaligned 16B stores. They are terrible for performance.
22896 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
22897 // extensions can use this to mark that it does not want splitting to happen
22898 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
22899 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
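  // For example (illustrative), a 16-byte store with only 8-byte alignment is
  // emitted as two 64-bit stores of the low and high halves at [x0] and
  // [x0, #8] rather than a single misaligned str q.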
22900 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
22901 S->getAlign() <= Align(2))
22902 return SDValue();
22903
22904 // If we get a splat of a scalar convert this vector store to a store of
22905 // scalars. They will be merged into store pairs thereby removing two
22906 // instructions.
22907 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
22908 return ReplacedSplat;
22909
22910 SDLoc DL(S);
22911
22912 // Split VT into two.
22913 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
22914 unsigned NumElts = HalfVT.getVectorNumElements();
22915 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
22916 DAG.getConstant(0, DL, MVT::i64));
22917 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
22918 DAG.getConstant(NumElts, DL, MVT::i64));
22919 SDValue BasePtr = S->getBasePtr();
22920 SDValue NewST1 =
22921 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
22922 S->getAlign(), S->getMemOperand()->getFlags());
22923 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
22924 DAG.getConstant(8, DL, MVT::i64));
22925 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
22926 S->getPointerInfo(), S->getAlign(),
22927 S->getMemOperand()->getFlags());
22928}
22929
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
22931 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
22932
22933 // splice(pg, op1, undef) -> op1
22934 if (N->getOperand(2).isUndef())
22935 return N->getOperand(1);
22936
22937 return SDValue();
22938}
22939
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
22941 const AArch64Subtarget *Subtarget) {
22942 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
22943 N->getOpcode() == AArch64ISD::UUNPKLO) &&
22944 "Unexpected Opcode!");
22945
22946 // uunpklo/hi undef -> undef
22947 if (N->getOperand(0).isUndef())
22948 return DAG.getUNDEF(N->getValueType(0));
22949
22950 // If this is a masked load followed by an UUNPKLO, fold this into a masked
22951 // extending load. We can do this even if this is already a masked
22952 // {z,}extload.
22953 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
22954 N->getOpcode() == AArch64ISD::UUNPKLO) {
22955 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
22956 SDValue Mask = MLD->getMask();
22957 SDLoc DL(N);
22958
22959 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
22960 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22961 (MLD->getPassThru()->isUndef() ||
22962 isZerosVector(MLD->getPassThru().getNode()))) {
22963 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22964 unsigned PgPattern = Mask->getConstantOperandVal(0);
22965 EVT VT = N->getValueType(0);
22966
22967 // Ensure we can double the size of the predicate pattern
22968 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22969 if (NumElts &&
22970 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
22971 Mask =
22972 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
22973 SDValue PassThru = DAG.getConstant(0, DL, VT);
22974 SDValue NewLoad = DAG.getMaskedLoad(
22975 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
22976 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
              MLD->getAddressingMode(), ISD::ZEXTLOAD);
22978
22979 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
22980
22981 return NewLoad;
22982 }
22983 }
22984 }
22985
22986 return SDValue();
22987}
22988
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
22990 if (N->getOpcode() != AArch64ISD::UZP1)
22991 return false;
22992 SDValue Op0 = N->getOperand(0);
22993 EVT SrcVT = Op0->getValueType(0);
22994 EVT DstVT = N->getValueType(0);
22995 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
22996 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
22997 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
22998}
22999
23000// Try to combine rounding shifts where the operands come from an extend, and
23001// the result is truncated and combined into one vector.
23002// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23004 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23005 SDValue Op0 = N->getOperand(0);
23006 SDValue Op1 = N->getOperand(1);
23007 EVT ResVT = N->getValueType(0);
23008
23009 unsigned RshOpc = Op0.getOpcode();
23010 if (RshOpc != AArch64ISD::RSHRNB_I)
23011 return SDValue();
23012
23013 // Same op code and imm value?
23014 SDValue ShiftValue = Op0.getOperand(1);
23015 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
23016 return SDValue();
23017
23018 // Same unextended operand value?
23019 SDValue Lo = Op0.getOperand(0);
23020 SDValue Hi = Op1.getOperand(0);
23021 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23022 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23023 return SDValue();
23024 SDValue OrigArg = Lo.getOperand(0);
23025 if (OrigArg != Hi.getOperand(0))
23026 return SDValue();
23027
23028 SDLoc DL(N);
23029 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
23030 getPredicateForVector(DAG, DL, ResVT), OrigArg,
23031 ShiftValue);
23032}
23033
23034// Try to simplify:
23035// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23036// t2 = nxv8i16 srl(t1, ShiftValue)
23037// to
23038// t1 = nxv8i16 rshrnb(X, shiftvalue).
23039// rshrnb zeroes the top half of each element. Therefore, this combine should
23040// only be performed when the instruction that consumes the rshrnb does not
23041// care about the top half of each element, for example a uzp1 or a
23042// truncating store.
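// For example (illustrative), with ShiftValue == 4 on nxv8i16:
//   t1 = add X, splat(8)     ; 8 == 1 << (4 - 1)
//   t2 = srl t1, splat(4)
// is a rounding shift right by 4, which rshrnb performs while narrowing, so
// only the low half of each result element is meaningful.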
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23044 const AArch64Subtarget *Subtarget) {
23045 EVT VT = Srl->getValueType(0);
23046 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23047 return SDValue();
23048
23049 EVT ResVT;
23050 if (VT == MVT::nxv8i16)
23051 ResVT = MVT::nxv16i8;
23052 else if (VT == MVT::nxv4i32)
23053 ResVT = MVT::nxv8i16;
23054 else if (VT == MVT::nxv2i64)
23055 ResVT = MVT::nxv4i32;
23056 else
23057 return SDValue();
23058
23059 SDLoc DL(Srl);
23060 unsigned ShiftValue;
23061 SDValue RShOperand;
23062 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23063 return SDValue();
23064 SDValue Rshrnb = DAG.getNode(
23065 AArch64ISD::RSHRNB_I, DL, ResVT,
23066 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23067 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23068}
23069
static SDValue isNVCastToHalfWidthElements(SDValue V) {
23071 if (V.getOpcode() != AArch64ISD::NVCAST)
23072 return SDValue();
23073
23074 SDValue Op = V.getOperand(0);
23075 if (V.getValueType().getVectorElementCount() !=
23076 Op.getValueType().getVectorElementCount() * 2)
23077 return SDValue();
23078
23079 return Op;
23080}
23081
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23083 const AArch64Subtarget *Subtarget) {
23084 SDLoc DL(N);
23085 SDValue Op0 = N->getOperand(0);
23086 SDValue Op1 = N->getOperand(1);
23087 EVT ResVT = N->getValueType(0);
23088
23089 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23090 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23092 Op0.getOperand(0) == Op1.getOperand(0)) {
23093
23094 SDValue SourceVec = Op0.getOperand(0);
23095 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23096 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23097 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23098 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23099 EVT OpVT = Op0.getOperand(1).getValueType();
23100 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23101 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23102 DAG.getUNDEF(WidenedResVT));
23103 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23104 DAG.getConstant(0, DL, OpVT));
23105 }
23106 }
23107
23108 // Following optimizations only work with uzp1.
23109 if (N->getOpcode() == AArch64ISD::UZP2)
23110 return SDValue();
23111
23112 // uzp1(x, undef) -> concat(truncate(x), undef)
23113 if (Op1.getOpcode() == ISD::UNDEF) {
23114 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23115 switch (ResVT.getSimpleVT().SimpleTy) {
23116 default:
23117 break;
23118 case MVT::v16i8:
23119 BCVT = MVT::v8i16;
23120 HalfVT = MVT::v8i8;
23121 break;
23122 case MVT::v8i16:
23123 BCVT = MVT::v4i32;
23124 HalfVT = MVT::v4i16;
23125 break;
23126 case MVT::v4i32:
23127 BCVT = MVT::v2i64;
23128 HalfVT = MVT::v2i32;
23129 break;
23130 }
23131 if (BCVT != MVT::Other) {
23132 SDValue BC = DAG.getBitcast(BCVT, Op0);
23133 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23134 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23135 DAG.getUNDEF(HalfVT));
23136 }
23137 }
23138
23139 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23140 return Urshr;
23141
23142 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23143 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23144 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23145 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23146 }
23147 }
23148
23149 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23150 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23151 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23152 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23153 }
23154 }
23155
23156 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23157 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23158 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23159 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23160 SDValue X = PreCast.getOperand(0).getOperand(0);
23161 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23162 }
23163 }
23164 }
23165
23166 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23167 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23168 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23169 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23170 SDValue Z = PreCast.getOperand(0).getOperand(1);
23171 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23172 }
23173 }
23174 }
23175
23176 // These optimizations only work on little endian.
23177 if (!DAG.getDataLayout().isLittleEndian())
23178 return SDValue();
23179
23180 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23181 // Example:
23182 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23183 // to
23184 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
  if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
23186 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23187 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23188 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23189 Op1.getOperand(0));
23190 }
23191 }
23192
23193 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23194 return SDValue();
23195
23196 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23197 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23198
23199 // truncating uzp1(x, y) -> xtn(concat (x, y))
23200 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23201 EVT Op0Ty = SourceOp0.getValueType();
23202 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23203 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23204 SDValue Concat =
          DAG.getNode(ISD::CONCAT_VECTORS, DL,
                      Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
23207 SourceOp0, SourceOp1);
23208 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23209 }
23210 }
23211
23212 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23213 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23214 SourceOp1.getOpcode() != ISD::TRUNCATE)
23215 return SDValue();
23216 SourceOp0 = SourceOp0.getOperand(0);
23217 SourceOp1 = SourceOp1.getOperand(0);
23218
23219 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23220 !SourceOp0.getValueType().isSimple())
23221 return SDValue();
23222
23223 EVT ResultTy;
23224
23225 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23226 case MVT::v2i64:
23227 ResultTy = MVT::v4i32;
23228 break;
23229 case MVT::v4i32:
23230 ResultTy = MVT::v8i16;
23231 break;
23232 case MVT::v8i16:
23233 ResultTy = MVT::v16i8;
23234 break;
23235 default:
23236 return SDValue();
23237 }
23238
23239 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23240 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23241 SDValue UzpResult =
23242 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23243
23244 EVT BitcastResultTy;
23245
23246 switch (ResVT.getSimpleVT().SimpleTy) {
23247 case MVT::v2i32:
23248 BitcastResultTy = MVT::v2i64;
23249 break;
23250 case MVT::v4i16:
23251 BitcastResultTy = MVT::v4i32;
23252 break;
23253 case MVT::v8i8:
23254 BitcastResultTy = MVT::v8i16;
23255 break;
23256 default:
23257 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23258 }
23259
23260 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23261 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23262}
23263
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23265 unsigned Opc = N->getOpcode();
23266
23267 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
23269 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
23271 "Invalid opcode.");
23272
23273 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23275 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23277 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23281
23282 SDLoc DL(N);
23283 SDValue Chain = N->getOperand(0);
23284 SDValue Pg = N->getOperand(1);
23285 SDValue Base = N->getOperand(2);
23286 SDValue Offset = N->getOperand(3);
23287 SDValue Ty = N->getOperand(4);
23288
23289 EVT ResVT = N->getValueType(0);
23290
23291 const auto OffsetOpc = Offset.getOpcode();
23292 const bool OffsetIsZExt =
      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23294 const bool OffsetIsSExt =
      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23296
23297 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23298 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23299 SDValue ExtPg = Offset.getOperand(0);
23300 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
23301 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23302
23303 // If the predicate for the sign- or zero-extended offset is the
23304 // same as the predicate used for this load, and the sign-/zero-extension
23305 // was from 32 bits, fold the extension into the gather's addressing mode.
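    // For example (illustrative), a gather whose i32 offsets are explicitly
    // zero-extended to i64 under the same predicate can instead use the
    // 32-bit unsigned-extending (UXTW) addressing form with the unextended
    // offsets, dropping the separate extend node.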
23306 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23307 SDValue UnextendedOffset = Offset.getOperand(1);
23308
23309 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
23310 if (Signed)
23311 NewOpc = getSignExtendedGatherOpcode(NewOpc);
23312
23313 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
23314 {Chain, Pg, Base, UnextendedOffset, Ty});
23315 }
23316 }
23317
23318 return SDValue();
23319}
23320
23321/// Optimize a vector shift instruction and its operand if shifted out
23322/// bits are not used.
static SDValue performVectorShiftCombine(SDNode *N,
23324 const AArch64TargetLowering &TLI,
                                         TargetLowering::DAGCombinerInfo &DCI) {
23326 assert(N->getOpcode() == AArch64ISD::VASHR ||
23327 N->getOpcode() == AArch64ISD::VLSHR);
23328
23329 SDValue Op = N->getOperand(0);
23330 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23331
23332 unsigned ShiftImm = N->getConstantOperandVal(1);
23333 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23334
23335 // Remove sign_extend_inreg, i.e. ashr(shl(x)), based on the number of sign bits.
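  // For example (illustrative), vashr(vshl(x, 8), 8) on v8i16 only re-creates
  // the sign bits; if every lane of x already has more than 8 sign bits, the
  // pair can be replaced by x itself.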
23336 if (N->getOpcode() == AArch64ISD::VASHR &&
23337 Op.getOpcode() == AArch64ISD::VSHL &&
23338 N->getOperand(1) == Op.getOperand(1))
23339 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
23340 return Op.getOperand(0);
23341
23342 // If the shift is exact, the shifted out bits matter.
23343 if (N->getFlags().hasExact())
23344 return SDValue();
23345
23346 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
23347 APInt DemandedMask = ~ShiftedOutBits;
23348
23349 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
23350 return SDValue(N, 0);
23351
23352 return SDValue();
23353}
23354
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23356 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23357 // This transform works in partnership with performSetCCPunpkCombine to
23358 // remove unnecessary transfer of predicates into standard registers and back
23359 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
23360 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
23361 MVT::i1) {
23362 SDValue CC = N->getOperand(0)->getOperand(0);
23363 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
23364 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
23365 DAG.getVectorIdxConstant(0, SDLoc(N)));
23366 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
23367 }
23368
23369 return SDValue();
23370}
23371
23372/// Target-specific DAG combine function for post-increment LD1 (lane) and
23373/// post-increment LD1R.
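/// For example (illustrative), a lane load whose address is also advanced by
/// the element size can become a single post-indexed form, roughly:
///
///   ld1 { v0.s }[1], [x0], #4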
static SDValue performPostLD1Combine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
23376 bool IsLaneOp) {
23377 if (DCI.isBeforeLegalizeOps())
23378 return SDValue();
23379
23380 SelectionDAG &DAG = DCI.DAG;
23381 EVT VT = N->getValueType(0);
23382
23383 if (!VT.is128BitVector() && !VT.is64BitVector())
23384 return SDValue();
23385
23386 // If it is not a LOAD, we cannot do such a combine.
23387 unsigned LoadIdx = IsLaneOp ? 1 : 0;
23388 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
23389 if (!LD)
23390 return SDValue();
23391
23392 // If the Generic combiner already helped form a pre- or post-indexed load,
23393 // skip forming one here.
23394 if (LD->isIndexed())
23395 return SDValue();
23396
23397 // The vector lane must be a constant in the LD1LANE opcode.
23398 SDValue Lane;
23399 if (IsLaneOp) {
23400 Lane = N->getOperand(2);
23401 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
23402 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23403 return SDValue();
23404 }
23405
23406 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
23407 EVT MemVT = LoadSDN->getMemoryVT();
23408 // Check if memory operand is the same type as the vector element.
23409 if (MemVT != VT.getVectorElementType())
23410 return SDValue();
23411
23412 // Check if there are other uses. If so, do not combine as it will introduce
23413 // an extra load.
23414 for (SDUse &U : LD->uses()) {
23415 if (U.getResNo() == 1) // Ignore uses of the chain result.
23416 continue;
23417 if (U.getUser() != N)
23418 return SDValue();
23419 }
23420
23421 // If there is one use and it can splat the value, prefer that operation.
23422 // TODO: This could be expanded to more operations if they reliably use the
23423 // index variants.
23424 if (N->hasOneUse()) {
23425 unsigned UseOpc = N->user_begin()->getOpcode();
23426 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23427 return SDValue();
23428 }
23429
23430 SDValue Addr = LD->getOperand(1);
23431 SDValue Vector = N->getOperand(0);
23432 // Search for a use of the address operand that is an increment.
23433 for (SDUse &Use : Addr->uses()) {
23434 SDNode *User = Use.getUser();
23435 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23436 continue;
23437
23438 // If the increment is a constant, it must match the memory ref size.
23439 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23440 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
23441 uint32_t IncVal = CInc->getZExtValue();
23442 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
23443 if (IncVal != NumBytes)
23444 continue;
23445 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
23446 }
23447
23448 // To avoid cycle construction make sure that neither the load nor the add
23449 // are predecessors to each other or the Vector.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
23452 Visited.insert(Addr.getNode());
23453 Worklist.push_back(User);
23454 Worklist.push_back(LD);
23455 Worklist.push_back(Vector.getNode());
23456 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
23457 SDNode::hasPredecessorHelper(User, Visited, Worklist))
23458 continue;
23459
    SmallVector<SDValue, 8> Ops;
23461 Ops.push_back(LD->getOperand(0)); // Chain
23462 if (IsLaneOp) {
23463 Ops.push_back(Vector); // The vector to be inserted
23464 Ops.push_back(Lane); // The lane to be inserted in the vector
23465 }
23466 Ops.push_back(Addr);
23467 Ops.push_back(Inc);
23468
23469 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
23470 SDVTList SDTys = DAG.getVTList(Tys);
23471 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
23472 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
23473 MemVT,
23474 LoadSDN->getMemOperand());
23475
23476 // Update the uses.
23477 SDValue NewResults[] = {
23478 SDValue(LD, 0), // The result of load
23479 SDValue(UpdN.getNode(), 2) // Chain
23480 };
23481 DCI.CombineTo(LD, NewResults);
23482 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
23483 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
23484
23485 break;
23486 }
23487 return SDValue();
23488}
23489
23490/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
23491/// address translation.
static bool performTBISimplification(SDValue Addr,
                                     TargetLowering::DAGCombinerInfo &DCI,
23494 SelectionDAG &DAG) {
23495 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
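  // Under TBI only bits [55:0] of a pointer take part in translation, so a
  // user that only forms an address does not demand bits [63:56]; an explicit
  // (and X, 0x00ffffffffffffff) of the pointer, for example, can be dropped.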
23496 KnownBits Known;
23498 !DCI.isBeforeLegalizeOps());
23499 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23500 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
23501 DCI.CommitTargetLoweringOpt(TLO);
23502 return true;
23503 }
23504 return false;
23505}
23506
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
23508 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
23509 "Expected STORE dag node in input!");
23510
23511 if (auto Store = dyn_cast<StoreSDNode>(N)) {
23512 if (!Store->isTruncatingStore() || Store->isIndexed())
23513 return SDValue();
23514 SDValue Ext = Store->getValue();
23515 auto ExtOpCode = Ext.getOpcode();
23516 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
23517 ExtOpCode != ISD::ANY_EXTEND)
23518 return SDValue();
23519 SDValue Orig = Ext->getOperand(0);
23520 if (Store->getMemoryVT() != Orig.getValueType())
23521 return SDValue();
23522 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
23523 Store->getBasePtr(), Store->getMemOperand());
23524 }
23525
23526 return SDValue();
23527}
23528
23529// A custom combine to lower load <3 x i8> as the more efficient sequence
23530// below:
23531// ldrb wX, [x0, #2]
23532// ldrh wY, [x0]
23533// orr wX, wY, wX, lsl #16
23534// fmov s0, wX
23535//
23536// Note that an alternative sequence with even fewer (although usually more
23537// complex/expensive) instructions would be:
23538// ld1r.4h { v0 }, [x0], #2
23539// ld1.b { v0 }[2], [x0]
23540//
23541// Generating this sequence unfortunately results in noticeably worse codegen
23542// for code that extends the loaded v3i8, due to legalization breaking vector
23543// shuffle detection in a way that is very difficult to work around.
23544// TODO: Revisit once v3i8 legalization has been improved in general.
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
23546 EVT MemVT = LD->getMemoryVT();
23547 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
23548 LD->getOriginalAlign() >= 4)
23549 return SDValue();
23550
23551 SDLoc DL(LD);
  MachineFunction &MF = DAG.getMachineFunction();
23553 SDValue Chain = LD->getChain();
23554 SDValue BasePtr = LD->getBasePtr();
23555 MachineMemOperand *MMO = LD->getMemOperand();
23556 assert(LD->getOffset().isUndef() && "undef offset expected");
23557
23558 // Load 2 x i8, then 1 x i8.
23559 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
23560 TypeSize Offset2 = TypeSize::getFixed(2);
23561 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
23562 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
23563 MF.getMachineMemOperand(MMO, 2, 1));
23564
23565 // Extend to i32.
23566 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
23567 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
23568
23569 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
23570 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
23571 DAG.getConstant(16, DL, MVT::i32));
23572 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
23573 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
23574
23575 // Extract v3i8 again.
23576 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
23577 DAG.getConstant(0, DL, MVT::i64));
23578 SDValue TokenFactor = DAG.getNode(
23579 ISD::TokenFactor, DL, MVT::Other,
23580 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
23581 return DAG.getMergeValues({Extract, TokenFactor}, DL);
23582}
23583
23584// Perform TBI simplification if supported by the target, and try to break up
23585// non-temporal loads larger than 256 bits with odd types so that LDNPQ 256-bit
23586// load instructions can be selected.
static SDValue performLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
23589 SelectionDAG &DAG,
23590 const AArch64Subtarget *Subtarget) {
23591 if (Subtarget->supportsAddressTopByteIgnored())
23592 performTBISimplification(N->getOperand(1), DCI, DAG);
23593
23594 LoadSDNode *LD = cast<LoadSDNode>(N);
23595 EVT RegVT = LD->getValueType(0);
23596 EVT MemVT = LD->getMemoryVT();
23597 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23598 SDLoc DL(LD);
23599
23600 // Cast ptr32 and ptr64 pointers to the default address space before a load.
23601 unsigned AddrSpace = LD->getAddressSpace();
23602 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
23603 AddrSpace == ARM64AS::PTR32_UPTR) {
23604 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
23605 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
23606 SDValue Cast =
23607 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
23608 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
23609 Cast, LD->getPointerInfo(), MemVT,
23610 LD->getOriginalAlign(),
23611 LD->getMemOperand()->getFlags());
23612 }
23613 }
23614
23615 if (LD->isVolatile() || !Subtarget->isLittleEndian())
23616 return SDValue(N, 0);
23617
23618 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
23619 return Res;
23620
23621 if (!LD->isNonTemporal())
23622 return SDValue(N, 0);
23623
23624 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
23625 MemVT.getSizeInBits() % 256 == 0 ||
23626 256 % MemVT.getScalarSizeInBits() != 0)
23627 return SDValue(N, 0);
23628
23629 SDValue Chain = LD->getChain();
23630 SDValue BasePtr = LD->getBasePtr();
23631 SDNodeFlags Flags = LD->getFlags();
  SmallVector<SDValue, 4> LoadOps;
23633 SmallVector<SDValue, 4> LoadOpsChain;
23634 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
23635 // and one scalar/vector load of fewer than 256 bits. This way we can utilize
23636 // 256-bit loads and reduce the number of load instructions generated.
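  // For example (illustrative), a non-temporal load of 384 bits of i32 data is
  // rebuilt as one 256-bit v8i32 load plus a v4i32 load for the remaining
  // 128 bits, concatenated and then trimmed back to the original type.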
23637 MVT NewVT =
      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
23639 256 / MemVT.getVectorElementType().getSizeInBits());
23640 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
23641 // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32.
23642 for (unsigned I = 0; I < Num256Loads; I++) {
23643 unsigned PtrOffset = I * 32;
23644 SDValue NewPtr = DAG.getMemBasePlusOffset(
23645 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
23646 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
23647 SDValue NewLoad = DAG.getLoad(
23648 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
23649 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
23650 LoadOps.push_back(NewLoad);
23651 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
23652 }
23653
23654 // Process the remaining bits of the load operation.
23655 // This is done by creating an UNDEF vector to match the size of the
23656 // 256-bit loads and inserting the remaining load into it. We extract the
23657 // original load type at the end using an EXTRACT_SUBVECTOR instruction.
23658 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
23659 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
23660 MVT RemainingVT = MVT::getVectorVT(
      MemVT.getVectorElementType().getSimpleVT(),
23662 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
23663 SDValue NewPtr = DAG.getMemBasePlusOffset(
23664 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
23665 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
23666 SDValue RemainingLoad =
23667 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
23668 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
23669 LD->getMemOperand()->getFlags(), LD->getAAInfo());
23670 SDValue UndefVector = DAG.getUNDEF(NewVT);
23671 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
23672 SDValue ExtendedRemainingLoad =
23673 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
23674 {UndefVector, RemainingLoad, InsertIdx});
23675 LoadOps.push_back(ExtendedRemainingLoad);
23676 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
23677 EVT ConcatVT =
      EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
23679 LoadOps.size() * NewVT.getVectorNumElements());
23680 SDValue ConcatVectors =
23681 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
23682 // Extract the original vector type size.
23683 SDValue ExtractSubVector =
23684 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
23685 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
23686 SDValue TokenFactor =
23687 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
23688 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
23689}
23690
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
23692 EVT VecVT = Op.getValueType();
23693 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
23694 "Need boolean vector type.");
23695
23696 if (Depth > 3)
    return EVT();
23698
23699 // We can get the base type from a vector compare or truncate.
23700 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
23701 return Op.getOperand(0).getValueType();
23702
23703 // If an operand is a bool vector, continue looking.
  EVT BaseVT;
23705 for (SDValue Operand : Op->op_values()) {
23706 if (Operand.getValueType() != VecVT)
23707 continue;
23708
23709 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
23710 if (!BaseVT.isSimple())
23711 BaseVT = OperandVT;
23712 else if (OperandVT != BaseVT)
      return EVT();
23714 }
23715
23716 return BaseVT;
23717}
23718
23719// When converting an <N x iX> vector to <N x i1> to store or use as a scalar
23720// iN, we can use a trick that extracts the i^th bit from the i^th element and
23721// then performs an add-across-vector reduction to get a scalar bitmask. This
23722// requires that each element's bits are either all 1 or all 0.
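// For example (illustrative), for a v4i32 compare result whose lanes are all
// ones or all zeroes, ANDing with the mask {1, 2, 4, 8} and reducing with
// VECREDUCE_ADD leaves one bit per lane in a single scalar.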
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
23724 SDLoc DL(N);
23725 SDValue ComparisonResult(N, 0);
23726 EVT VecVT = ComparisonResult.getValueType();
23727 assert(VecVT.isVector() && "Must be a vector type");
23728
23729 unsigned NumElts = VecVT.getVectorNumElements();
23730 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
23731 return SDValue();
23732
23733 if (VecVT.getVectorElementType() != MVT::i1 &&
23734 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
23735 return SDValue();
23736
23737 // If we can find the original types to work on instead of a vector of i1,
23738 // we can avoid extend/extract conversion instructions.
23739 if (VecVT.getVectorElementType() == MVT::i1) {
23740 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
23741 if (!VecVT.isSimple()) {
23742 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
23743 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
23744 }
23745 }
23746 VecVT = VecVT.changeVectorElementTypeToInteger();
23747
23748 // Large vectors don't map directly to this conversion, so to avoid too many
23749 // edge cases, we don't apply it here. The conversion will likely still be
23750 // applied later via multiple smaller vectors, whose results are concatenated.
23751 if (VecVT.getSizeInBits() > 128)
23752 return SDValue();
23753
23754 // Ensure that all elements' bits are either 0s or 1s.
23755 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
23756
23757 SmallVector<SDValue, 16> MaskConstants;
23759 VecVT == MVT::v16i8) {
23760 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
23761 // per entry. We split it into two halves, apply the mask, zip the halves to
23762 // create 8x 16-bit values, and then perform the vector reduce.
23763 for (unsigned Half = 0; Half < 2; ++Half) {
23764 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
23765 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
23766 }
23767 }
23768 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
23769 SDValue RepresentativeBits =
23770 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
23771
23772 SDValue UpperRepresentativeBits =
23773 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
23774 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
23775 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
23776 RepresentativeBits, UpperRepresentativeBits);
23777 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
23778 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
23779 }
23780
23781 // All other vector sizes.
23782 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
23783 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
23784 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
23785 }
23786
23787 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
23788 SDValue RepresentativeBits =
23789 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
23790 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
23791 NumElts, VecVT.getVectorElementType().getSizeInBits()));
23792 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
23793}
23794
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
23796 StoreSDNode *Store) {
23797 if (!Store->isTruncatingStore())
23798 return SDValue();
23799
23800 SDLoc DL(Store);
23801 SDValue VecOp = Store->getValue();
23802 EVT VT = VecOp.getValueType();
23803 EVT MemVT = Store->getMemoryVT();
23804
23805 if (!MemVT.isVector() || !VT.isVector() ||
23806 MemVT.getVectorElementType() != MVT::i1)
23807 return SDValue();
23808
23809 // If we are storing a vector that we are currently building, let
23810 // `scalarizeVectorStore()` handle this more efficiently.
23811 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
23812 return SDValue();
23813
23814 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
23815 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
23816 if (!VectorBits)
23817 return SDValue();
23818
23819 EVT StoreVT =
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
23821 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
23822 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
23823 Store->getMemOperand());
23824}
23825
static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
23827 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
23828 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
23829 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
23830}
23831
23832// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
23834 const AArch64Subtarget *Subtarget) {
23835 SDValue Value = ST->getValue();
23836 EVT ValueVT = Value.getValueType();
23837
23838 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
23839 Value.getOpcode() != ISD::TRUNCATE ||
23840 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
23841 return SDValue();
23842
23843 assert(ST->getOffset().isUndef() && "undef offset expected");
23844 SDLoc DL(ST);
23845 auto WideVT = EVT::getVectorVT(
23846 *DAG.getContext(),
23847 Value->getOperand(0).getValueType().getVectorElementType(), 4);
23848 SDValue UndefVector = DAG.getUNDEF(WideVT);
23849 SDValue WideTrunc = DAG.getNode(
23850 ISD::INSERT_SUBVECTOR, DL, WideVT,
23851 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
23852 SDValue Cast = DAG.getNode(
23853 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
23854 WideTrunc);
23855
  MachineFunction &MF = DAG.getMachineFunction();
23857 SDValue Chain = ST->getChain();
23858 MachineMemOperand *MMO = ST->getMemOperand();
23859 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
23860 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23861 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
23862 TypeSize Offset2 = TypeSize::getFixed(2);
23863 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
23864 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
23865
23866 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23867 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
23868 TypeSize Offset1 = TypeSize::getFixed(1);
23869 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
23870 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
23871
23872 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23873 DAG.getConstant(0, DL, MVT::i64));
23874 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
23875 MF.getMachineMemOperand(MMO, 0, 1));
23876 return Chain;
23877}
23878
static SDValue performSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
23881 SelectionDAG &DAG,
23882 const AArch64Subtarget *Subtarget) {
23883 StoreSDNode *ST = cast<StoreSDNode>(N);
23884 SDValue Chain = ST->getChain();
23885 SDValue Value = ST->getValue();
23886 SDValue Ptr = ST->getBasePtr();
23887 EVT ValueVT = Value.getValueType();
23888 EVT MemVT = ST->getMemoryVT();
23889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23890 SDLoc DL(ST);
23891
23892 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
23893 EVT EltVT = VT.getVectorElementType();
23894 return EltVT == MVT::f32 || EltVT == MVT::f64;
23895 };
23896
23897 // Cast ptr32 and ptr64 pointers to the default address space before a store.
23898 unsigned AddrSpace = ST->getAddressSpace();
23899 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
23900 AddrSpace == ARM64AS::PTR32_UPTR) {
23901 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
23902 if (PtrVT != Ptr.getSimpleValueType()) {
23903 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
23904 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
23905 ST->getOriginalAlign(),
23906 ST->getMemOperand()->getFlags(), ST->getAAInfo());
23907 }
23908 }
23909
23910 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
23911 return Res;
23912
23913 // If this is an FP_ROUND followed by a store, fold this into a truncating
23914 // store. We can do this even if this is already a truncstore.
23915 // We purposefully don't care about legality of the nodes here as we know
23916 // they can be split down into something legal.
23917 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
23918 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
23919 Subtarget->useSVEForFixedLengthVectors() &&
23920 ValueVT.isFixedLengthVector() &&
23921 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
23922 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
23923 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
23924 ST->getMemOperand());
23925
23926 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
23927 return Split;
23928
23929 if (Subtarget->supportsAddressTopByteIgnored() &&
23930 performTBISimplification(N->getOperand(2), DCI, DAG))
23931 return SDValue(N, 0);
23932
23933 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
23934 return Store;
23935
23936 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
23937 return Store;
23938
23939 if (ST->isTruncatingStore()) {
23940 EVT StoreVT = ST->getMemoryVT();
23941 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
23942 return SDValue();
23943 if (SDValue Rshrnb =
23944 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
23945 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
23946 StoreVT, ST->getMemOperand());
23947 }
23948 }
23949
23950 return SDValue();
23951}
23952
static SDValue performMSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
23955 SelectionDAG &DAG,
23956 const AArch64Subtarget *Subtarget) {
23957 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
23958 SDValue Value = MST->getValue();
23959 SDValue Mask = MST->getMask();
23960 SDLoc DL(N);
23961
23962 // If this is a UZP1 followed by a masked store, fold this into a masked
23963 // truncating store. We can do this even if this is already a masked
23964 // truncstore.
23965 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
23966 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23967 Value.getValueType().isInteger()) {
23968 Value = Value.getOperand(0);
23969 if (Value.getOpcode() == ISD::BITCAST) {
23970 EVT HalfVT =
23971 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
23972 EVT InVT = Value.getOperand(0).getValueType();
23973
23974 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
23975 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23976 unsigned PgPattern = Mask->getConstantOperandVal(0);
23977
23978 // Ensure we can double the size of the predicate pattern
23979 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23980 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
23981 MinSVESize) {
23982 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
23983 PgPattern);
23984 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
23985 MST->getBasePtr(), MST->getOffset(), Mask,
23986 MST->getMemoryVT(), MST->getMemOperand(),
23987 MST->getAddressingMode(),
23988 /*IsTruncating=*/true);
23989 }
23990 }
23991 }
23992 }
23993
23994 if (MST->isTruncatingStore()) {
23995 EVT ValueVT = Value->getValueType(0);
23996 EVT MemVT = MST->getMemoryVT();
23997 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
23998 return SDValue();
23999 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
24000 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
24001 MST->getOffset(), MST->getMask(),
24002 MST->getMemoryVT(), MST->getMemOperand(),
24003 MST->getAddressingMode(), true);
24004 }
24005 }
24006
24007 return SDValue();
24008}
24009
24010/// \return true if part of the index was folded into the Base.
24011static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24012 SDLoc DL, SelectionDAG &DAG) {
24013 // This function assumes a vector of i64 indices.
24014 EVT IndexVT = Index.getValueType();
24015 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24016 return false;
24017
24018 // Simplify:
24019 // BasePtr = Ptr
24020 // Index = X + splat(Offset)
24021 // ->
24022 // BasePtr = Ptr + Offset * scale.
24023 // Index = X
24024 if (Index.getOpcode() == ISD::ADD) {
24025 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
24026 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24027 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24028 Index = Index.getOperand(0);
24029 return true;
24030 }
24031 }
24032
24033 // Simplify:
24034 // BasePtr = Ptr
24035 // Index = (X + splat(Offset)) << splat(Shift)
24036 // ->
24037 // BasePtr = Ptr + (Offset << Shift) * Scale
24038 // Index = X << splat(shift)
24039 if (Index.getOpcode() == ISD::SHL &&
24040 Index.getOperand(0).getOpcode() == ISD::ADD) {
24041 SDValue Add = Index.getOperand(0);
24042 SDValue ShiftOp = Index.getOperand(1);
24043 SDValue OffsetOp = Add.getOperand(1);
24044 if (auto Shift = DAG.getSplatValue(ShiftOp))
24045 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
24046 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
24047 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24048 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24049 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
24050 Add.getOperand(0), ShiftOp);
24051 return true;
24052 }
24053 }
24054
24055 return false;
24056}
24057
24058// Analyse the specified address and return true if a more optimal addressing
24059// mode is available. When returning true, all parameters are updated to
24060// reflect their recommended values.
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
24062 SDValue &BasePtr, SDValue &Index,
24063 SelectionDAG &DAG) {
24064 // Try to iteratively fold parts of the index into the base pointer to
24065 // simplify the index as much as possible.
24066 bool Changed = false;
24067 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
24068 Changed = true;
24069
24070 // Only consider element types that are pointer sized as smaller types can
24071 // be easily promoted.
24072 EVT IndexVT = Index.getValueType();
24073 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24074 return Changed;
24075
24076 // Can indices be trivially shrunk?
24077 EVT DataVT = N->getOperand(1).getValueType();
24078 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
24079 // will later be re-extended to 64 bits in legalization
24080 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24081 return Changed;
24082 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
24083 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24084 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
24085 return true;
24086 }
24087
24088 // Match:
24089 // Index = step(const)
24090 int64_t Stride = 0;
24091 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24092 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
24093 }
24094 // Match:
24095 // Index = step(const) << shift(const)
24096 else if (Index.getOpcode() == ISD::SHL &&
24097 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
24098 SDValue RHS = Index.getOperand(1);
24099 if (auto *Shift =
24100 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
24101 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
24102 Stride = Step << Shift->getZExtValue();
24103 }
24104 }
24105
24106 // Return early because no supported pattern is found.
24107 if (Stride == 0)
24108 return Changed;
24109
24110 if (Stride < std::numeric_limits<int32_t>::min() ||
24111 Stride > std::numeric_limits<int32_t>::max())
24112 return Changed;
24113
24114 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24115 unsigned MaxVScale =
      Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
24117 int64_t LastElementOffset =
24118 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24119
24120 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24121 LastElementOffset > std::numeric_limits<int32_t>::max())
24122 return Changed;
24123
24124 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24125 // Stride is not scaled explicitly by 'Scale' here, because that scaling
24126 // happens in the gather/scatter addressing mode.
24127 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
24128 return true;
24129}
24130
static SDValue performMaskedGatherScatterCombine(
    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24133 if (!DCI.isBeforeLegalize())
24134 return SDValue();
24135 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
24136
24137 SDLoc DL(MGS);
24138 SDValue Chain = MGS->getChain();
24139 SDValue Scale = MGS->getScale();
24140 SDValue Index = MGS->getIndex();
24141 SDValue Mask = MGS->getMask();
24142 SDValue BasePtr = MGS->getBasePtr();
24143 ISD::MemIndexType IndexType = MGS->getIndexType();
24144
24145 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
24146 return SDValue();
24147
24148 // Here we catch such cases early and change MGATHER's IndexType to allow
24149 // the use of an Index that's more legalisation friendly.
24150 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
24151 SDValue PassThru = MGT->getPassThru();
24152 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24153 return DAG.getMaskedGather(
24154 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
24155 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
24156 }
24157 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
24158 SDValue Data = MSC->getValue();
24159 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24160 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
24161 DL, Ops, MSC->getMemOperand(), IndexType,
24162 MSC->isTruncatingStore());
24163 }
24164 auto *HG = cast<MaskedHistogramSDNode>(MGS);
24165 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
24166 Index, Scale, HG->getIntID()};
24167 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
24168 DL, Ops, HG->getMemOperand(), IndexType);
24169}
24170
24171/// Target-specific DAG combine function for NEON load/store intrinsics
24172/// to merge base address updates.
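/// For example (illustrative), when the pointer fed to an st2 intrinsic is
/// also advanced by the size of the stored data, the pair can be merged into a
/// single post-incremented form, roughly:
///
///   st2 { v0.4s, v1.4s }, [x0], #32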
static SDValue performNEONPostLDSTCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
24175 SelectionDAG &DAG) {
24176 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24177 return SDValue();
24178
24179 unsigned AddrOpIdx = N->getNumOperands() - 1;
24180 SDValue Addr = N->getOperand(AddrOpIdx);
24181
24182 // Search for a use of the address operand that is an increment.
24183 for (SDUse &Use : Addr->uses()) {
24184 SDNode *User = Use.getUser();
24185 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24186 continue;
24187
24188 // Check that the add is independent of the load/store. Otherwise, folding
24189 // it would create a cycle.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
24192 Visited.insert(Addr.getNode());
24193 Worklist.push_back(N);
24194 Worklist.push_back(User);
24195 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
24196 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24197 continue;
24198
24199 // Find the new opcode for the updating load/store.
24200 bool IsStore = false;
24201 bool IsLaneOp = false;
24202 bool IsDupOp = false;
24203 unsigned NewOpc = 0;
24204 unsigned NumVecs = 0;
24205 unsigned IntNo = N->getConstantOperandVal(1);
24206 switch (IntNo) {
24207 default: llvm_unreachable("unexpected intrinsic for Neon base update");
24208 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
24209 NumVecs = 2; break;
24210 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
24211 NumVecs = 3; break;
24212 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
24213 NumVecs = 4; break;
24214 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
24215 NumVecs = 2; IsStore = true; break;
24216 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
24217 NumVecs = 3; IsStore = true; break;
24218 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
24219 NumVecs = 4; IsStore = true; break;
24220 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
24221 NumVecs = 2; break;
24222 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
24223 NumVecs = 3; break;
24224 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
24225 NumVecs = 4; break;
24226 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
24227 NumVecs = 2; IsStore = true; break;
24228 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
24229 NumVecs = 3; IsStore = true; break;
24230 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
24231 NumVecs = 4; IsStore = true; break;
24232 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
24233 NumVecs = 2; IsDupOp = true; break;
24234 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
24235 NumVecs = 3; IsDupOp = true; break;
24236 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
24237 NumVecs = 4; IsDupOp = true; break;
24238 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
24239 NumVecs = 2; IsLaneOp = true; break;
24240 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
24241 NumVecs = 3; IsLaneOp = true; break;
24242 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
24243 NumVecs = 4; IsLaneOp = true; break;
24244 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
24245 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
24246 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
24247 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
24248 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
24249 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
24250 }
24251
24252 EVT VecTy;
24253 if (IsStore)
24254 VecTy = N->getOperand(2).getValueType();
24255 else
24256 VecTy = N->getValueType(0);
24257
24258 // If the increment is a constant, it must match the memory ref size.
24259 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24260 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24261 uint32_t IncVal = CInc->getZExtValue();
24262 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
24263 if (IsLaneOp || IsDupOp)
24264 NumBytes /= VecTy.getVectorNumElements();
24265 if (IncVal != NumBytes)
24266 continue;
24267 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24268 }
    SmallVector<SDValue, 8> Ops;
24270 Ops.push_back(N->getOperand(0)); // Incoming chain
24271 // Load lane and store have vector list as input.
24272 if (IsLaneOp || IsStore)
24273 for (unsigned i = 2; i < AddrOpIdx; ++i)
24274 Ops.push_back(N->getOperand(i));
24275 Ops.push_back(Addr); // Base register
24276 Ops.push_back(Inc);
24277
24278 // Return Types.
24279 EVT Tys[6];
24280 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
24281 unsigned n;
24282 for (n = 0; n < NumResultVecs; ++n)
24283 Tys[n] = VecTy;
24284 Tys[n++] = MVT::i64; // Type of write back register
24285 Tys[n] = MVT::Other; // Type of the chain
24286 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
24287
24288 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
24289 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
24290 MemInt->getMemoryVT(),
24291 MemInt->getMemOperand());
24292
24293 // Update the uses.
24294 std::vector<SDValue> NewResults;
24295 for (unsigned i = 0; i < NumResultVecs; ++i) {
24296 NewResults.push_back(SDValue(UpdN.getNode(), i));
24297 }
24298 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
24299 DCI.CombineTo(N, NewResults);
24300 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
24301
24302 break;
24303 }
24304 return SDValue();
24305}
24306
24307// Checks to see if the value is the prescribed width and returns information
24308// about its extension mode.
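// For example (illustrative), an i32 value produced by an extending i8 load is
// treated as 8 bits wide here, with ExtType reporting whether the load sign-
// or zero-extended it.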
24309static
24310bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
24311 ExtType = ISD::NON_EXTLOAD;
24312 switch(V.getNode()->getOpcode()) {
24313 default:
24314 return false;
24315 case ISD::LOAD: {
24316 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
24317 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
24318 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
24319 ExtType = LoadNode->getExtensionType();
24320 return true;
24321 }
24322 return false;
24323 }
24324 case ISD::AssertSext: {
24325 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24326 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24327 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24328 ExtType = ISD::SEXTLOAD;
24329 return true;
24330 }
24331 return false;
24332 }
24333 case ISD::AssertZext: {
24334 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24335 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24336 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24337 ExtType = ISD::ZEXTLOAD;
24338 return true;
24339 }
24340 return false;
24341 }
24342 case ISD::Constant:
24343 case ISD::TargetConstant: {
24344 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
24345 1LL << (width - 1);
24346 }
24347 }
24348
24349 return true;
24350}
24351
24352// This function does a whole lot of voodoo to determine if the tests are
24353// equivalent without and with a mask. Essentially what happens is that given a
24354// DAG resembling:
24355//
24356// +-------------+ +-------------+ +-------------+ +-------------+
24357// | Input | | AddConstant | | CompConstant| | CC |
24358// +-------------+ +-------------+ +-------------+ +-------------+
24359// | | | |
24360// V V | +----------+
24361// +-------------+ +----+ | |
24362// | ADD | |0xff| | |
24363// +-------------+ +----+ | |
24364// | | | |
24365// V V | |
24366// +-------------+ | |
24367// | AND | | |
24368// +-------------+ | |
24369// | | |
24370// +-----+ | |
24371// | | |
24372// V V V
24373// +-------------+
24374// | CMP |
24375// +-------------+
24376//
24377// The AND node may be safely removed for some combinations of inputs. In
24378// particular we need to take into account the extension type of the Input,
24379// the exact values of AddConstant, CompConstant, and CC, along with the nominal
24380 // width of the input (this can work for inputs of any width; the above graph is
24381 // specific to 8 bits).
24382//
24383// The specific equations were worked out by generating output tables for each
24384 // AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
24385 // problem was simplified by working with 4-bit inputs, which means we only
24386 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
24387 // extension (8..15), 8 patterns unique to sign extension (-8..-1), and 8
24388 // patterns present in both extensions (0..7). For every distinct pair of
24389 // AddConstant and CompConstant bit patterns we can consider the masked and
24390 // unmasked versions to be equivalent if the result of this function is true for
24391 // all 16 distinct bit patterns for the current extension type of Input (w0).
24392//
24393// sub w8, w0, w1
24394// and w10, w8, #0x0f
24395// cmp w8, w2
24396// cset w9, AArch64CC
24397// cmp w10, w2
24398// cset w11, AArch64CC
24399// cmp w9, w11
24400// cset w0, eq
24401// ret
24402//
24403 // Since the above sequence shows when the outputs are equivalent it defines
24404 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
24405 // would be expensive to run during compiles. The equations below were written
24406 // in a test harness that confirmed they give outputs equivalent to the above
24407 // function for all inputs, so they can be used instead to determine whether
24408 // the removal is legal.
24409//
24410 // isEquivalentMaskless() is the code for testing if the AND can be removed,
24411 // factored out of the DAG recognition because the DAG can take several forms.
24412
24413static bool isEquivalentMaskless(unsigned CC, unsigned width,
24414 ISD::LoadExtType ExtType, int AddConstant,
24415 int CompConstant) {
24416 // By being careful about our equations and only writing them in terms of
24417 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
24418 // make them generally applicable to all bit widths.
24419 int MaxUInt = (1 << width);
24420
24421 // For the purposes of these comparisons sign extending the type is
24422 // equivalent to zero extending the add and displacing it by half the integer
24423 // width. Provided we are careful and make sure our equations are valid over
24424 // the whole range we can just adjust the input and avoid writing equations
24425 // for sign extended inputs.
24426 if (ExtType == ISD::SEXTLOAD)
24427 AddConstant -= (1 << (width-1));
24428
24429 switch(CC) {
24430 case AArch64CC::LE:
24431 case AArch64CC::GT:
24432 if ((AddConstant == 0) ||
24433 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
24434 (AddConstant >= 0 && CompConstant < 0) ||
24435 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
24436 return true;
24437 break;
24438 case AArch64CC::LT:
24439 case AArch64CC::GE:
24440 if ((AddConstant == 0) ||
24441 (AddConstant >= 0 && CompConstant <= 0) ||
24442 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
24443 return true;
24444 break;
24445 case AArch64CC::HI:
24446 case AArch64CC::LS:
24447 if ((AddConstant >= 0 && CompConstant < 0) ||
24448 (AddConstant <= 0 && CompConstant >= -1 &&
24449 CompConstant < AddConstant + MaxUInt))
24450 return true;
24451 break;
24452 case AArch64CC::PL:
24453 case AArch64CC::MI:
24454 if ((AddConstant == 0) ||
24455 (AddConstant > 0 && CompConstant <= 0) ||
24456 (AddConstant < 0 && CompConstant <= AddConstant))
24457 return true;
24458 break;
24459 case AArch64CC::LO:
24460 case AArch64CC::HS:
24461 if ((AddConstant >= 0 && CompConstant <= 0) ||
24462 (AddConstant <= 0 && CompConstant >= 0 &&
24463 CompConstant <= AddConstant + MaxUInt))
24464 return true;
24465 break;
24466 case AArch64CC::EQ:
24467 case AArch64CC::NE:
24468 if ((AddConstant > 0 && CompConstant < 0) ||
24469 (AddConstant < 0 && CompConstant >= 0 &&
24470 CompConstant < AddConstant + MaxUInt) ||
24471 (AddConstant >= 0 && CompConstant >= 0 &&
24472 CompConstant >= AddConstant) ||
24473 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
24474 return true;
24475 break;
24476 case AArch64CC::VS:
24477 case AArch64CC::VC:
24478 case AArch64CC::AL:
24479 case AArch64CC::NV:
24480 return true;
24481 case AArch64CC::Invalid:
24482 break;
24483 }
24484
24485 return false;
24486}
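// Illustrative sketch (not part of AArch64ISelLowering.cpp): a brute-force
// harness in the spirit of the assembly sequence quoted above. For a chosen
// condition, input width and extension kind it checks that comparing
// ((x + AddC) & Mask) against CompC and comparing (x + AddC) against CompC
// give the same answer for every representable input x, which is the property
// isEquivalentMaskless() encodes symbolically. The enum names and the 32-bit
// compare emulation below are assumptions made purely for illustration.
#include <cstdint>

enum class Ext { Zero, Sign };
enum class Cond { ULT, UGE, SLT, SGE, EQ, NE }; // a representative subset

static bool evalCond(Cond C, uint32_t L, uint32_t R) {
  switch (C) {
  case Cond::ULT: return L < R;
  case Cond::UGE: return L >= R;
  case Cond::SLT: return (int32_t)L < (int32_t)R;
  case Cond::SGE: return (int32_t)L >= (int32_t)R;
  case Cond::EQ:  return L == R;
  case Cond::NE:  return L != R;
  }
  return false;
}

// True if dropping the AND mask can never change the comparison result for
// any Width-bit input of the given extension kind.
static bool maskIsRedundant(Cond C, unsigned Width, Ext E, int32_t AddC,
                            int32_t CompC) {
  const int64_t Lo = (E == Ext::Sign) ? -(1LL << (Width - 1)) : 0;
  const int64_t Hi = (E == Ext::Sign) ? (1LL << (Width - 1)) : (1LL << Width);
  const uint32_t Mask = (1u << Width) - 1;
  for (int64_t X = Lo; X < Hi; ++X) {
    uint32_t Sum = (uint32_t)(X + AddC);
    if (evalCond(C, Sum & Mask, (uint32_t)CompC) !=
        evalCond(C, Sum, (uint32_t)CompC))
      return false;
  }
  return true;
}

int main() {
  // With AddC == 0 and zero-extended 8-bit inputs the sum never leaves the
  // low 8 bits, so the mask is a no-op and therefore redundant.
  return maskIsRedundant(Cond::EQ, 8, Ext::Zero, 0, 3) ? 0 : 1;
}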
24487
24488 // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
24489 // (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
24490 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
24491 SDNode *AndNode, SelectionDAG &DAG,
24492 unsigned CCIndex, unsigned CmpIndex,
24493 unsigned CC) {
24494 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
24495 if (!SubsC)
24496 return SDValue();
24497
24498 APInt SubsAP = SubsC->getAPIntValue();
24499 if (CC == AArch64CC::HI) {
24500 if (!SubsAP.isMask())
24501 return SDValue();
24502 } else if (CC == AArch64CC::LO) {
24503 if (!SubsAP.isPowerOf2())
24504 return SDValue();
24505 } else
24506 return SDValue();
24507
24508 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
24509 if (!AndC)
24510 return SDValue();
24511
24512 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
24513
24514 SDLoc DL(N);
24515 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
24516 SDValue ANDS = DAG.getNode(
24517 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
24518 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
24519 SDValue AArch64_CC =
24520 DAG.getConstant(CC, DL,
24521 N->getOperand(CCIndex)->getValueType(0));
24522
24523 // For now, only performCSELCombine and performBRCONDCombine call this
24524 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex with 4
24525 // operands, so just initialize the operands directly to keep the code simple.
24526 // If some other caller ever passes different CCIndex/CmpIndex values, this
24527 // will need to be rewritten to use a loop.
24528 // TODO: Do we need to assert that the number of operands is 4 here?
24529 assert((CCIndex == 2 && CmpIndex == 3) &&
24530 "Expected CCIndex to be 2 and CmpIndex to be 3.");
24531 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
24532 ANDS.getValue(1)};
24533 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
24534}
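// Illustrative sketch (not part of AArch64ISelLowering.cpp): the two scalar
// identities stated in the comment above performSubsToAndsCombine, checked
// exhaustively for 8-bit values. This is a standalone harness written for
// this listing; none of the names below come from LLVM.
#include <cassert>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C = 0; C < 256; ++C)
      for (unsigned K = 0; K < 8; ++K) {
        // (X & C) >u Mask  <=>  (X & (C & ~Mask)) != 0, for Mask = 2^K - 1.
        unsigned Mask = (1u << K) - 1;
        assert(((X & C) > Mask) == ((X & (C & ~Mask)) != 0));
        // (X & C) <u Pow2  <=>  (X & (C & ~(Pow2 - 1))) == 0, for Pow2 = 2^K.
        unsigned Pow2 = 1u << K;
        assert(((X & C) < Pow2) == ((X & (C & ~(Pow2 - 1))) == 0));
      }
  return 0;
}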
24535
24536static
24537 SDValue performCONDCombine(SDNode *N,
24538 TargetLowering::DAGCombinerInfo &DCI,
24539 SelectionDAG &DAG, unsigned CCIndex,
24540 unsigned CmpIndex) {
24541 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
24542 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
24543 unsigned CondOpcode = SubsNode->getOpcode();
24544
24545 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
24546 !SubsNode->hasOneUse())
24547 return SDValue();
24548
24549 // There is a SUBS feeding this condition. Is it fed by a mask we can
24550 // use?
24551
24552 SDNode *AndNode = SubsNode->getOperand(0).getNode();
24553 unsigned MaskBits = 0;
24554
24555 if (AndNode->getOpcode() != ISD::AND)
24556 return SDValue();
24557
24558 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
24559 CmpIndex, CC))
24560 return Val;
24561
24562 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
24563 uint32_t CNV = CN->getZExtValue();
24564 if (CNV == 255)
24565 MaskBits = 8;
24566 else if (CNV == 65535)
24567 MaskBits = 16;
24568 }
24569
24570 if (!MaskBits)
24571 return SDValue();
24572
24573 SDValue AddValue = AndNode->getOperand(0);
24574
24575 if (AddValue.getOpcode() != ISD::ADD)
24576 return SDValue();
24577
24578 // The basic dag structure is correct, grab the inputs and validate them.
24579
24580 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
24581 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
24582 SDValue SubsInputValue = SubsNode->getOperand(1);
24583
24584 // The mask is present and the provenance of all the values is a smaller type,
24585 // so let's see if the mask is superfluous.
24586
24587 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
24588 !isa<ConstantSDNode>(SubsInputValue.getNode()))
24589 return SDValue();
24590
24591 ISD::LoadExtType ExtType;
24592
24593 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
24594 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
24595 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
24596 return SDValue();
24597
24598 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
24599 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
24600 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
24601 return SDValue();
24602
24603 // The AND is not necessary, remove it.
24604
24605 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
24606 SubsNode->getValueType(1));
24607 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
24608
24609 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
24610 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
24611
24612 return SDValue(N, 0);
24613}
24614
24615// Optimize compare with zero and branch.
24616 static SDValue performBRCONDCombine(SDNode *N,
24617 TargetLowering::DAGCombinerInfo &DCI,
24618 SelectionDAG &DAG) {
24619 MachineFunction &MF = DAG.getMachineFunction();
24620 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
24621 // will not be produced, as they are conditional branch instructions that do
24622 // not set flags.
24623 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
24624 return SDValue();
24625
24626 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
24627 N = NV.getNode();
24628 SDValue Chain = N->getOperand(0);
24629 SDValue Dest = N->getOperand(1);
24630 SDValue CCVal = N->getOperand(2);
24631 SDValue Cmp = N->getOperand(3);
24632
24633 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
24634 unsigned CC = CCVal->getAsZExtVal();
24635 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
24636 return SDValue();
24637
24638 unsigned CmpOpc = Cmp.getOpcode();
24639 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
24640 return SDValue();
24641
24642 // Only attempt folding if there is only one use of the flag and no use of the
24643 // value.
24644 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
24645 return SDValue();
24646
24647 SDValue LHS = Cmp.getOperand(0);
24648 SDValue RHS = Cmp.getOperand(1);
24649
24650 assert(LHS.getValueType() == RHS.getValueType() &&
24651 "Expected the value type to be the same for both operands!");
24652 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
24653 return SDValue();
24654
24655 if (isNullConstant(LHS))
24656 std::swap(LHS, RHS);
24657
24658 if (!isNullConstant(RHS))
24659 return SDValue();
24660
24661 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
24662 LHS.getOpcode() == ISD::SRL)
24663 return SDValue();
24664
24665 // Fold the compare into the branch instruction.
24666 SDValue BR;
24667 if (CC == AArch64CC::EQ)
24668 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
24669 else
24670 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
24671
24672 // Do not add new nodes to DAG combiner worklist.
24673 DCI.CombineTo(N, BR, false);
24674
24675 return SDValue();
24676}
24677
24678 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
24679 unsigned CC = N->getConstantOperandVal(2);
24680 SDValue SUBS = N->getOperand(3);
24681 SDValue Zero, CTTZ;
24682
24683 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
24684 Zero = N->getOperand(0);
24685 CTTZ = N->getOperand(1);
24686 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
24687 Zero = N->getOperand(1);
24688 CTTZ = N->getOperand(0);
24689 } else
24690 return SDValue();
24691
24692 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
24693 (CTTZ.getOpcode() == ISD::TRUNCATE &&
24694 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
24695 return SDValue();
24696
24697 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
24698 "Illegal type in CTTZ folding");
24699
24700 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
24701 return SDValue();
24702
24703 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
24704 ? CTTZ.getOperand(0).getOperand(0)
24705 : CTTZ.getOperand(0);
24706
24707 if (X != SUBS.getOperand(0))
24708 return SDValue();
24709
24710 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
24711 ? CTTZ.getOperand(0).getValueSizeInBits()
24712 : CTTZ.getValueSizeInBits();
24713 SDValue BitWidthMinusOne =
24714 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
24715 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
24716 BitWidthMinusOne);
24717}
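// Illustrative sketch (not part of AArch64ISelLowering.cpp): why the AND with
// bitwidth-1 can replace the select in foldCSELofCTTZ. ISD::CTTZ is defined
// to return the bit width for a zero input, and 32 & 31 == 0, while any
// non-zero input has cttz(X) <= 31, so the mask is a no-op there. Standalone
// harness with a reference cttz; the names below are made up.
#include <cassert>
#include <cstdint>

static unsigned cttz32(uint32_t X) {
  if (X == 0)
    return 32; // bit width, matching ISD::CTTZ semantics for zero
  unsigned N = 0;
  while (!(X & 1)) {
    X >>= 1;
    ++N;
  }
  return N;
}

int main() {
  const uint32_t Samples[] = {0, 1, 2, 3, 8, 0x80000000u, 0xffffffffu};
  for (uint32_t X : Samples) {
    unsigned Select = (X == 0) ? 0u : cttz32(X); // CSEL 0, cttz(X), eq(X, 0)
    unsigned Masked = cttz32(X) & 31u;           // cttz(X) & (bitwidth - 1)
    assert(Select == Masked);
  }
  return 0;
}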
24718
24719// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
24720// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
24721// Where x and y are constants and x != y
24722
24723// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
24724// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
24725// Where x and y are constants and x != y
24726 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
24727 SDValue L = Op->getOperand(0);
24728 SDValue R = Op->getOperand(1);
24729 AArch64CC::CondCode OpCC =
24730 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
24731
24732 SDValue OpCmp = Op->getOperand(3);
24733 if (!isCMP(OpCmp))
24734 return SDValue();
24735
24736 SDValue CmpLHS = OpCmp.getOperand(0);
24737 SDValue CmpRHS = OpCmp.getOperand(1);
24738
24739 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
24740 std::swap(CmpLHS, CmpRHS);
24741 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
24742 return SDValue();
24743
24744 SDValue X = CmpLHS->getOperand(0);
24745 SDValue Y = CmpLHS->getOperand(1);
24746 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
24747 return SDValue();
24748 }
24749
24750 // If one of the constants is an opaque constant, the x and y SDNodes are
24751 // still different but the underlying values may be the same, so compare the
24752 // APInts here to make sure the code is correct.
24753 ConstantSDNode *CX = cast<ConstantSDNode>(X);
24754 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
24755 if (CX->getAPIntValue() == CY->getAPIntValue())
24756 return SDValue();
24757
24758 AArch64CC::CondCode CC =
24759 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
24760 SDValue Cond = CmpLHS->getOperand(3);
24761
24762 if (CmpRHS == Y)
24763 CC = AArch64CC::getInvertedCondCode(CC);
24764 else if (CmpRHS != X)
24765 return SDValue();
24766
24767 if (OpCC == AArch64CC::NE)
24768 CC = AArch64CC::getInvertedCondCode(CC);
24769 else if (OpCC != AArch64CC::EQ)
24770 return SDValue();
24771
24772 SDLoc DL(Op);
24773 EVT VT = Op->getValueType(0);
24774
24775 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
24776 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
24777}
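// Illustrative sketch (not part of AArch64ISelLowering.cpp): the scalar
// intuition behind the CSEL-of-CSEL fold above. If z = cond ? x : y with
// distinct constants x and y, comparing z against x simply recovers cond
// (and comparing against y recovers !cond), so the outer select can be keyed
// on cond directly. Standalone harness; all names are made up.
#include <cassert>

static int nestedSelect(bool Cond, int L, int R) {
  const int X = 5, Y = 9;  // distinct constants, as the combine requires
  int Z = Cond ? X : Y;    // inner CSEL x, y, cc2
  return (Z == X) ? L : R; // outer CSEL l, r, EQ (CMP z, x)
}

int main() {
  for (int C = 0; C <= 1; ++C) {
    bool Cond = (C != 0);
    assert(nestedSelect(Cond, 1, 2) == (Cond ? 1 : 2)); // folded: CSEL l, r, cc2
  }
  return 0;
}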
24778
24779// Reassociate the true/false expressions of a CSEL instruction to obtain a
24780// common subexpression with the comparison instruction. For example, change
24781// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
24782// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
24783// subexpression.
24784 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
24785 SDValue SubsNode = N->getOperand(3);
24786 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
24787 return SDValue();
24788
24789 SDValue CmpOpToMatch = SubsNode.getOperand(1);
24790 SDValue CmpOpOther = SubsNode.getOperand(0);
24791 EVT VT = N->getValueType(0);
24792
24793 unsigned ExpectedOpcode;
24794 SDValue ExpectedOp;
24795 SDValue SubsOp;
24796 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
24797 if (CmpOpConst) {
24798 ExpectedOpcode = ISD::ADD;
24799 ExpectedOp =
24800 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
24801 CmpOpConst->getValueType(0));
24802 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
24803 CmpOpConst->getValueType(0));
24804 } else {
24805 ExpectedOpcode = ISD::SUB;
24806 ExpectedOp = CmpOpToMatch;
24807 SubsOp = CmpOpToMatch;
24808 }
24809
24810 // Get the operand that can be reassociated with the SUBS instruction.
24811 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
24812 if (Op.getOpcode() != ExpectedOpcode)
24813 return SDValue();
24814 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
24815 !Op.getOperand(0).hasOneUse())
24816 return SDValue();
24817 SDValue X = Op.getOperand(0).getOperand(0);
24818 SDValue Y = Op.getOperand(0).getOperand(1);
24819 if (X != CmpOpOther)
24820 std::swap(X, Y);
24821 if (X != CmpOpOther)
24822 return SDValue();
24823 if (ExpectedOp != Op.getOperand(1))
24824 return SDValue();
24825 return Y;
24826 };
24827
24828 // Try the reassociation using the given constant and condition code.
24829 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
24830 SDValue SubsOp) {
24831 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
24832 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
24833 if (!TReassocOp && !FReassocOp)
24834 return SDValue();
24835
24836 SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
24837 DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp);
24838
24839 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
24840 if (!ReassocOp)
24841 return N->getOperand(OpNum);
24842 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
24843 NewCmp.getValue(0), ReassocOp);
24844 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
24845 return Res;
24846 };
24847
24848 SDValue TValReassoc = Reassociate(TReassocOp, 0);
24849 SDValue FValReassoc = Reassociate(FReassocOp, 1);
24850 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
24851 DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC),
24852 NewCmp.getValue(1));
24853 };
24854
24855 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
24856
24857 // First, try to eliminate the compare instruction by searching for a
24858 // subtraction with the same constant.
24859 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
24860 return R;
24861
24862 if (!CmpOpConst) {
24863 // Try again with the operands of the SUBS instruction and the condition
24864 // swapped. Due to canonicalization, this only helps for non-constant
24865 // operands of the SUBS instruction.
24866 std::swap(CmpOpToMatch, CmpOpOther);
24867 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
24868 return R;
24869 return SDValue();
24870 }
24871
24872 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
24873 return SDValue();
24874
24875 // Next, search for a subtraction with a slightly different constant. By
24876 // adjusting the condition code, we can still eliminate the compare
24877 // instruction. Adjusting the constant is only valid if it does not result
24878 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
24879 // Since such comparisons are trivially true/false, we should not encounter
24880 // them here but check for them nevertheless to be on the safe side.
24881 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
24882 AArch64CC::CondCode NewCC) {
24883 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
24884 CmpOpConst->getValueType(0));
24885 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
24886 CmpOpConst->getValueType(0));
24887 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
24888 };
24889 switch (CC) {
24890 case AArch64CC::EQ:
24891 case AArch64CC::LS:
24892 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
24893 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
24894 case AArch64CC::NE:
24895 case AArch64CC::HI:
24896 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
24897 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
24898 case AArch64CC::LO:
24899 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
24900 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
24901 case AArch64CC::HS:
24902 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
24903 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
24904 case AArch64CC::LT:
24905 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
24906 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
24907 case AArch64CC::LE:
24908 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
24909 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
24910 case AArch64CC::GT:
24911 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
24912 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
24913 case AArch64CC::GE:
24914 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
24915 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
24916 default:
24917 return SDValue();
24918 }
24919}
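// Illustrative sketch (not part of AArch64ISelLowering.cpp): the constant and
// condition-code adjustments applied by CheckedFold above, checked
// exhaustively on 8-bit values. As long as the adjusted constant does not
// wrap, the two predicates select exactly the same inputs, which is what
// allows the combine to reuse a nearby SUBS. Standalone harness; names are
// made up.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C = 0; C < 256; ++C) {
      if (C != 255) {
        assert((X <= C) == (X < C + 1)); // LS with c  ==  LO with c+1
        assert((X > C) == (X >= C + 1)); // HI with c  ==  HS with c+1
      }
      if (C != 0) {
        assert((X < C) == (X <= C - 1)); // LO with c  ==  LS with c-1
        assert((X >= C) == (X > C - 1)); // HS with c  ==  HI with c-1
      }
      int SX = (int8_t)X, SC = (int8_t)C;
      if (SC != 127) {
        assert((SX <= SC) == (SX < SC + 1)); // LE with c == LT with c+1
        assert((SX > SC) == (SX >= SC + 1)); // GT with c == GE with c+1
      }
      if (SC != -128) {
        assert((SX < SC) == (SX <= SC - 1)); // LT with c == LE with c-1
        assert((SX >= SC) == (SX > SC - 1)); // GE with c == GT with c-1
      }
    }
  return 0;
}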
24920
24921// Optimize CSEL instructions
24922 static SDValue performCSELCombine(SDNode *N,
24923 TargetLowering::DAGCombinerInfo &DCI,
24924 SelectionDAG &DAG) {
24925 // CSEL x, x, cc -> x
24926 if (N->getOperand(0) == N->getOperand(1))
24927 return N->getOperand(0);
24928
24929 if (SDValue R = foldCSELOfCSEL(N, DAG))
24930 return R;
24931
24932 // Try to reassociate the true/false expressions so that we can do CSE with
24933 // a SUBS instruction used to perform the comparison.
24934 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
24935 return R;
24936
24937 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
24938 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
24939 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
24940 return Folded;
24941
24942 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
24943 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
24944 SDValue Cond = N->getOperand(3);
24945 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
24946 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
24947 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
24948 {Cond.getOperand(1), Cond.getOperand(0)}) &&
24949 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
24950 {Cond.getOperand(0), Cond.getOperand(1)}) &&
24951 !isNullConstant(Cond.getOperand(1))) {
24952 AArch64CC::CondCode OldCond =
24953 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
24954 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
24955 if (NewCond != AArch64CC::AL) {
24956 SDLoc DL(N);
24957 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
24958 Cond.getOperand(1), Cond.getOperand(0));
24959 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
24960 N->getOperand(1),
24961 DAG.getConstant(NewCond, DL, MVT::i32),
24962 Sub.getValue(1));
24963 }
24964 }
24965
24966 return performCONDCombine(N, DCI, DAG, 2, 3);
24967}
24968
24969 // Try to re-use an already extended operand of a vector SetCC feeding an
24970 // extended select. Doing so avoids requiring another full extension of the
24971// SET_CC result when lowering the select.
24972 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
24973 EVT Op0MVT = Op->getOperand(0).getValueType();
24974 if (!Op0MVT.isVector() || Op->use_empty())
24975 return SDValue();
24976
24977 // Make sure that all uses of Op are VSELECTs with result matching types where
24978 // the result type has a larger element type than the SetCC operand.
24979 SDNode *FirstUse = *Op->user_begin();
24980 if (FirstUse->getOpcode() != ISD::VSELECT)
24981 return SDValue();
24982 EVT UseMVT = FirstUse->getValueType(0);
24983 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
24984 return SDValue();
24985 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
24986 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
24987 }))
24988 return SDValue();
24989
24990 APInt V;
24991 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
24992 return SDValue();
24993
24994 SDLoc DL(Op);
24995 SDValue Op0ExtV;
24996 SDValue Op1ExtV;
24997 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
24998 // Check if the first operand of the SET_CC is already extended. If it is,
24999 // split the SET_CC and re-use the extended version of the operand.
25000 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
25001 Op->getOperand(0));
25002 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
25003 Op->getOperand(0));
25004 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25005 Op0ExtV = SDValue(Op0SExt, 0);
25006 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
25007 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25008 Op0ExtV = SDValue(Op0ZExt, 0);
25009 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
25010 } else
25011 return SDValue();
25012
25013 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
25014 Op0ExtV, Op1ExtV, Op->getOperand(2));
25015}
25016
25017static SDValue
25019 SelectionDAG &DAG) {
25020 SDValue Vec = N->getOperand(0);
25021 if (DCI.isBeforeLegalize() &&
25022 Vec.getValueType().getVectorElementType() == MVT::i1 &&
25025 SDLoc DL(N);
25026 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
25027 DAG);
25028 }
25029
25030 return SDValue();
25031}
25032
25033 static SDValue performSETCCCombine(SDNode *N,
25034 TargetLowering::DAGCombinerInfo &DCI,
25035 SelectionDAG &DAG) {
25036 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
25037 SDValue LHS = N->getOperand(0);
25038 SDValue RHS = N->getOperand(1);
25039 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
25040 SDLoc DL(N);
25041 EVT VT = N->getValueType(0);
25042
25043 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
25044 return V;
25045
25046 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
25047 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
25048 LHS->getOpcode() == AArch64ISD::CSEL &&
25049 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
25050 LHS->hasOneUse()) {
25051 // Invert CSEL's condition.
25052 auto OldCond =
25053 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
25054 auto NewCond = getInvertedCondCode(OldCond);
25055
25056 // csel 0, 1, !cond, X
25057 SDValue CSEL =
25058 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
25059 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
25060 LHS.getOperand(3));
25061 return DAG.getZExtOrTrunc(CSEL, DL, VT);
25062 }
25063
25064 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
25065 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
25066 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
25067 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
25068 LHS->hasOneUse()) {
25069 EVT TstVT = LHS->getValueType(0);
25070 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
25071 // This pattern gets optimized better in emitComparison.
25072 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
25073 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
25074 DAG.getSignedConstant(TstImm, DL, TstVT));
25075 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
25076 }
25077 }
25078
25079 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25080 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25081 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25082 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
25083 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25084 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25085 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
25086 LHS->getOpcode() == ISD::BITCAST) {
25087 EVT ToVT = LHS->getValueType(0);
25088 EVT FromVT = LHS->getOperand(0).getValueType();
25089 if (FromVT.isFixedLengthVector() &&
25090 FromVT.getVectorElementType() == MVT::i1) {
25091 bool IsNull = isNullConstant(RHS);
25092 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25093 DL, MVT::i1, LHS->getOperand(0));
25094 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
25095 LHS);
25096 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25097 }
25098 }
25099
25100 // Try to perform the memcmp when the result is tested for [in]equality with 0
25101 if (SDValue V = performOrXorChainCombine(N, DAG))
25102 return V;
25103
25104 return SDValue();
25105}
25106
25107// Replace a flag-setting operator (eg ANDS) with the generic version
25108// (eg AND) if the flag is unused.
25109 static SDValue performFlagSettingCombine(SDNode *N,
25110 TargetLowering::DAGCombinerInfo &DCI,
25111 unsigned GenericOpcode) {
25112 SDLoc DL(N);
25113 SDValue LHS = N->getOperand(0);
25114 SDValue RHS = N->getOperand(1);
25115 EVT VT = N->getValueType(0);
25116
25117 // If the flag result isn't used, convert back to a generic opcode.
25118 if (!N->hasAnyUseOfValue(1)) {
25119 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
25120 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
25121 DL);
25122 }
25123
25124 // Combine identical generic nodes into this node, re-using the result.
25125 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
25126 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
25127 DCI.CombineTo(Generic, SDValue(N, 0));
25128
25129 return SDValue();
25130}
25131
25132 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
25133 // setcc_merge_zero pred
25134 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
25135 // => extract_subvector (inner setcc_merge_zero)
25136 SDValue Pred = N->getOperand(0);
25137 SDValue LHS = N->getOperand(1);
25138 SDValue RHS = N->getOperand(2);
25139 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25140
25141 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
25142 LHS->getOpcode() != ISD::SIGN_EXTEND)
25143 return SDValue();
25144
25145 SDValue Extract = LHS->getOperand(0);
25146 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25147 Extract->getValueType(0) != N->getValueType(0) ||
25148 Extract->getConstantOperandVal(1) != 0)
25149 return SDValue();
25150
25151 SDValue InnerSetCC = Extract->getOperand(0);
25152 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
25153 return SDValue();
25154
25155 // By this point we've effectively got
25156 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
25157 // lanes are already zero then the trunc(sext()) sequence is redundant and we
25158 // can operate on A directly.
25159 SDValue InnerPred = InnerSetCC.getOperand(0);
25160 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
25161 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
25162 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
25163 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
25164 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
25165 return Extract;
25166
25167 return SDValue();
25168}
25169
25170static SDValue
25171 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
25172 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25173 "Unexpected opcode!");
25174
25175 SelectionDAG &DAG = DCI.DAG;
25176 SDValue Pred = N->getOperand(0);
25177 SDValue LHS = N->getOperand(1);
25178 SDValue RHS = N->getOperand(2);
25179 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25180
25181 if (SDValue V = performSetCCPunpkCombine(N, DAG))
25182 return V;
25183
25184 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
25185 LHS->getOpcode() == ISD::SIGN_EXTEND &&
25186 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
25187 // setcc_merge_zero(
25188 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
25189 // => setcc_merge_zero(pred, ...)
25190 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25191 LHS->getOperand(0)->getOperand(0) == Pred)
25192 return LHS->getOperand(0);
25193
25194 // setcc_merge_zero(
25195 // all_active, extend(nxvNi1 ...), != splat(0))
25196 // -> nxvNi1 ...
25197 if (isAllActivePredicate(DAG, Pred))
25198 return LHS->getOperand(0);
25199
25200 // setcc_merge_zero(
25201 // pred, extend(nxvNi1 ...), != splat(0))
25202 // -> nxvNi1 and(pred, ...)
25203 if (DCI.isAfterLegalizeDAG())
25204 // Do this after legalization to allow more folds on setcc_merge_zero
25205 // to be recognized.
25206 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
25207 LHS->getOperand(0), Pred);
25208 }
25209
25210 return SDValue();
25211}
25212
25213// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
25214// as well as whether the test should be inverted. This code is required to
25215// catch these cases (as opposed to standard dag combines) because
25216// AArch64ISD::TBZ is matched during legalization.
25217static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
25218 SelectionDAG &DAG) {
25219
25220 if (!Op->hasOneUse())
25221 return Op;
25222
25223 // We don't handle undef/constant-fold cases below, as they should have
25224 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
25225 // etc.)
25226
25227 // (tbz (trunc x), b) -> (tbz x, b)
25228 // This case is just here to enable more of the below cases to be caught.
25229 if (Op->getOpcode() == ISD::TRUNCATE &&
25230 Bit < Op->getValueType(0).getSizeInBits()) {
25231 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25232 }
25233
25234 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
25235 if (Op->getOpcode() == ISD::ANY_EXTEND &&
25236 Bit < Op->getOperand(0).getValueSizeInBits()) {
25237 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25238 }
25239
25240 if (Op->getNumOperands() != 2)
25241 return Op;
25242
25243 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
25244 if (!C)
25245 return Op;
25246
25247 switch (Op->getOpcode()) {
25248 default:
25249 return Op;
25250
25251 // (tbz (and x, m), b) -> (tbz x, b)
25252 case ISD::AND:
25253 if ((C->getZExtValue() >> Bit) & 1)
25254 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25255 return Op;
25256
25257 // (tbz (shl x, c), b) -> (tbz x, b-c)
25258 case ISD::SHL:
25259 if (C->getZExtValue() <= Bit &&
25260 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25261 Bit = Bit - C->getZExtValue();
25262 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25263 }
25264 return Op;
25265
25266 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
25267 case ISD::SRA:
25268 Bit = Bit + C->getZExtValue();
25269 if (Bit >= Op->getValueType(0).getSizeInBits())
25270 Bit = Op->getValueType(0).getSizeInBits() - 1;
25271 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25272
25273 // (tbz (srl x, c), b) -> (tbz x, b+c)
25274 case ISD::SRL:
25275 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25276 Bit = Bit + C->getZExtValue();
25277 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25278 }
25279 return Op;
25280
25281 // (tbz (xor x, -1), b) -> (tbnz x, b)
25282 case ISD::XOR:
25283 if ((C->getZExtValue() >> Bit) & 1)
25284 Invert = !Invert;
25285 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25286 }
25287}
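// Illustrative sketch (not part of AArch64ISelLowering.cpp): the bit-index
// rewrites performed by getTestBitOperand, checked on 32-bit scalars. Testing
// bit b of (x << c) is testing bit b-c of x (when b >= c), testing bit b of a
// logical right shift is testing bit b+c of x, and an arithmetic right shift
// saturates the index at the sign bit. Assumes arithmetic shifting for
// signed >>, as AArch64 provides; the names below are made up.
#include <cassert>
#include <cstdint>

static bool testBit(uint32_t V, unsigned B) { return (V >> B) & 1; }

int main() {
  const uint32_t Samples[] = {0, 1, 0x5a5a5a5au, 0x80000001u, 0xffffffffu};
  for (uint32_t X : Samples)
    for (unsigned C = 0; C < 32; ++C)
      for (unsigned B = 0; B < 32; ++B) {
        if (B >= C) // (tbz (shl x, c), b) -> (tbz x, b - c)
          assert(testBit(X << C, B) == testBit(X, B - C));
        if (B + C < 32) // (tbz (srl x, c), b) -> (tbz x, b + c)
          assert(testBit(X >> C, B) == testBit(X, B + C));
        // (tbz (sra x, c), b) -> (tbz x, min(b + c, 31))
        uint32_t Sra = (uint32_t)((int32_t)X >> C);
        unsigned SatBit = (B + C < 32) ? B + C : 31;
        assert(testBit(Sra, B) == testBit(X, SatBit));
      }
  return 0;
}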
25288
25289// Optimize test single bit zero/non-zero and branch.
25290 static SDValue performTBZCombine(SDNode *N,
25291 TargetLowering::DAGCombinerInfo &DCI,
25292 SelectionDAG &DAG) {
25293 unsigned Bit = N->getConstantOperandVal(2);
25294 bool Invert = false;
25295 SDValue TestSrc = N->getOperand(1);
25296 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
25297
25298 if (TestSrc == NewTestSrc)
25299 return SDValue();
25300
25301 unsigned NewOpc = N->getOpcode();
25302 if (Invert) {
25303 if (NewOpc == AArch64ISD::TBZ)
25304 NewOpc = AArch64ISD::TBNZ;
25305 else {
25306 assert(NewOpc == AArch64ISD::TBNZ);
25307 NewOpc = AArch64ISD::TBZ;
25308 }
25309 }
25310
25311 SDLoc DL(N);
25312 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
25313 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
25314}
25315
25316// Swap vselect operands where it may allow a predicated operation to achieve
25317// the `sel`.
25318//
25319// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
25320// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
25321 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
25322 auto SelectA = N->getOperand(1);
25323 auto SelectB = N->getOperand(2);
25324 auto NTy = N->getValueType(0);
25325
25326 if (!NTy.isScalableVector())
25327 return SDValue();
25328 SDValue SetCC = N->getOperand(0);
25329 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
25330 return SDValue();
25331
25332 switch (SelectB.getOpcode()) {
25333 default:
25334 return SDValue();
25335 case ISD::FMUL:
25336 case ISD::FSUB:
25337 case ISD::FADD:
25338 break;
25339 }
25340 if (SelectA != SelectB.getOperand(0))
25341 return SDValue();
25342
25343 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
25344 ISD::CondCode InverseCC =
25345 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
25346 auto InverseSetCC =
25347 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
25348 SetCC.getOperand(1), InverseCC);
25349
25350 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
25351 {InverseSetCC, SelectB, SelectA});
25352}
25353
25354// vselect (v1i1 setcc) ->
25355// vselect (v1iXX setcc) (XX is the size of the compared operand type)
25356// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
25357// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
25358// such VSELECT.
25359 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
25360 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
25361 return SwapResult;
25362
25363 SDValue N0 = N->getOperand(0);
25364 EVT CCVT = N0.getValueType();
25365
25366 if (isAllActivePredicate(DAG, N0))
25367 return N->getOperand(1);
25368
25369 if (isAllInactivePredicate(N0))
25370 return N->getOperand(2);
25371
25372 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
25373 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
25374 // supported types.
25375 SDValue SetCC = N->getOperand(0);
25376 if (SetCC.getOpcode() == ISD::SETCC &&
25377 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
25378 SDValue CmpLHS = SetCC.getOperand(0);
25379 EVT VT = CmpLHS.getValueType();
25380 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
25381 SDNode *SplatLHS = N->getOperand(1).getNode();
25382 SDNode *SplatRHS = N->getOperand(2).getNode();
25383 APInt SplatLHSVal;
25384 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
25385 VT.isSimple() &&
25386 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
25387 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
25388 VT.getSimpleVT().SimpleTy) &&
25389 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
25390 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
25391 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
25392 unsigned NumElts = VT.getVectorNumElements();
25393 SmallVector<SDValue> Ops(
25394 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
25395 VT.getScalarType()));
25396 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
25397
25398 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
25399 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
25400 return Or;
25401 }
25402 }
25403
25404 EVT CmpVT = N0.getOperand(0).getValueType();
25405 if (N0.getOpcode() != ISD::SETCC ||
25407 CCVT.getVectorElementType() != MVT::i1 ||
25409 return SDValue();
25410
25411 EVT ResVT = N->getValueType(0);
25412 // Only combine when the result type is of the same size as the compared
25413 // operands.
25414 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
25415 return SDValue();
25416
25417 SDValue IfTrue = N->getOperand(1);
25418 SDValue IfFalse = N->getOperand(2);
25419 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
25420 N0.getOperand(0), N0.getOperand(1),
25421 cast<CondCodeSDNode>(N0.getOperand(2))->get());
25422 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
25423 IfTrue, IfFalse);
25424}
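// Illustrative sketch (not part of AArch64ISelLowering.cpp): the scalar form
// of the sign pattern rewritten by the vselect combine above. For a signed
// N-bit x, (x > -1 ? 1 : -1) equals ((x >> (N-1)) | 1) because the arithmetic
// shift yields 0 for non-negative x and all-ones otherwise. Assumes
// arithmetic shifting for signed >>, as on AArch64; names are made up.
#include <cassert>
#include <cstdint>

int main() {
  const int32_t Samples[] = {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX};
  for (int32_t X : Samples) {
    int32_t Select = (X > -1) ? 1 : -1; // VSELECT (setgt x, -1), 1, -1
    int32_t Rewritten = (X >> 31) | 1;  // OR (ASR x, N-1), 1
    assert(Select == Rewritten);
  }
  return 0;
}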
25425
25426/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
25427/// the compare-mask instructions rather than going via NZCV, even if LHS and
25428/// RHS are really scalar. This replaces any scalar setcc in the above pattern
25429/// with a vector one followed by a DUP shuffle on the result.
25430 static SDValue performSelectCombine(SDNode *N,
25431 TargetLowering::DAGCombinerInfo &DCI) {
25432 SelectionDAG &DAG = DCI.DAG;
25433 SDValue N0 = N->getOperand(0);
25434 EVT ResVT = N->getValueType(0);
25435
25436 if (N0.getOpcode() != ISD::SETCC)
25437 return SDValue();
25438
25439 if (ResVT.isScalableVT())
25440 return SDValue();
25441
25442 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
25443 // scalar SetCCResultType. We also don't expect vectors, because we assume
25444 // that selects fed by vector SETCCs are canonicalized to VSELECT.
25445 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
25446 "Scalar-SETCC feeding SELECT has unexpected result type!");
25447
25448 // If NumMaskElts == 0, the comparison is larger than the select result. The
25449 // largest real NEON comparison is 64 bits per lane, which means the result is
25450 // at most 32 bits and an illegal vector. Just bail out for now.
25451 EVT SrcVT = N0.getOperand(0).getValueType();
25452
25453 // Don't try to do this optimization when the setcc itself has i1 operands.
25454 // There are no legal vectors of i1, so this would be pointless. v1f16 is
25455 // ruled out to prevent the creation of setccs that need to be scalarized.
25456 if (SrcVT == MVT::i1 ||
25457 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
25458 return SDValue();
25459
25460 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
25461 if (!ResVT.isVector() || NumMaskElts == 0)
25462 return SDValue();
25463
25464 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
25465 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
25466
25467 // Also bail out if the vector CCVT isn't the same size as ResVT.
25468 // This can happen if the SETCC operand size doesn't divide the ResVT size
25469 // (e.g., f64 vs v3f32).
25470 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
25471 return SDValue();
25472
25473 // Make sure we didn't create illegal types, if we're not supposed to.
25474 assert(DCI.isBeforeLegalize() ||
25475 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
25476
25477 // First perform a vector comparison, where lane 0 is the one we're interested
25478 // in.
25479 SDLoc DL(N0);
25480 SDValue LHS =
25481 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
25482 SDValue RHS =
25483 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
25484 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
25485
25486 // Now duplicate the comparison mask we want across all other lanes.
25487 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
25488 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
25489 Mask = DAG.getNode(ISD::BITCAST, DL,
25490 ResVT.changeVectorElementTypeToInteger(), Mask);
25491
25492 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
25493}
25494
25495 static SDValue performDUPCombine(SDNode *N,
25496 TargetLowering::DAGCombinerInfo &DCI) {
25497 EVT VT = N->getValueType(0);
25498 SDLoc DL(N);
25499 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
25500 // 128bit vector version.
25501 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
25502 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
25503 SmallVector<SDValue> Ops(N->ops());
25504 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
25505 DCI.DAG.getVTList(LVT), Ops)) {
25506 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
25507 DCI.DAG.getConstant(0, DL, MVT::i64));
25508 }
25509 }
25510
25511 if (N->getOpcode() == AArch64ISD::DUP) {
25512 if (DCI.isAfterLegalizeDAG()) {
25513 // If scalar dup's operand is extract_vector_elt, try to combine them into
25514 // duplane. For example,
25515 //
25516 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
25517 // t18: v4i32 = AArch64ISD::DUP t21
25518 // ==>
25519 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
25520 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
25521 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25522 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
25523 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
25524 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
25525 EXTRACT_VEC_ELT.getOperand(1));
25526 }
25527 }
25528 }
25529
25530 return performPostLD1Combine(N, DCI, false);
25531 }
25532
25533 return SDValue();
25534}
25535
25536/// Get rid of unnecessary NVCASTs (that don't change the type).
25537 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
25538 if (N->getValueType(0) == N->getOperand(0).getValueType())
25539 return N->getOperand(0);
25540 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
25541 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
25542 N->getOperand(0).getOperand(0));
25543
25544 return SDValue();
25545}
25546
25547// If all users of the globaladdr are of the form (globaladdr + constant), find
25548// the smallest constant, fold it into the globaladdr's offset and rewrite the
25549// globaladdr as (globaladdr + constant) - constant.
25550 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
25551 const AArch64Subtarget *Subtarget,
25552 const TargetMachine &TM) {
25553 auto *GN = cast<GlobalAddressSDNode>(N);
25554 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
25555 AArch64II::MO_NO_FLAG)
25556 return SDValue();
25557
25558 uint64_t MinOffset = -1ull;
25559 for (SDNode *N : GN->users()) {
25560 if (N->getOpcode() != ISD::ADD)
25561 return SDValue();
25562 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
25563 if (!C)
25564 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
25565 if (!C)
25566 return SDValue();
25567 MinOffset = std::min(MinOffset, C->getZExtValue());
25568 }
25569 uint64_t Offset = MinOffset + GN->getOffset();
25570
25571 // Require that the new offset is larger than the existing one. Otherwise, we
25572 // can end up oscillating between two possible DAGs, for example,
25573 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
25574 if (Offset <= uint64_t(GN->getOffset()))
25575 return SDValue();
25576
25577 // Check whether folding this offset is legal. It must not go out of bounds of
25578 // the referenced object to avoid violating the code model, and must be
25579 // smaller than 2^20 because this is the largest offset expressible in all
25580 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
25581 // stores an immediate signed 21 bit offset.)
25582 //
25583 // This check also prevents us from folding negative offsets, which will end
25584 // up being treated in the same way as large positive ones. They could also
25585 // cause code model violations, and aren't really common enough to matter.
25586 if (Offset >= (1 << 20))
25587 return SDValue();
25588
25589 const GlobalValue *GV = GN->getGlobal();
25590 Type *T = GV->getValueType();
25591 if (!T->isSized() ||
25593 return SDValue();
25594
25595 SDLoc DL(GN);
25596 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
25597 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
25598 DAG.getConstant(MinOffset, DL, MVT::i64));
25599}
25600
25601 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
25602 const AArch64Subtarget *Subtarget) {
25603 SDValue BR = N->getOperand(0);
25604 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
25605 !BR.getValueType().isScalarInteger())
25606 return SDValue();
25607
25608 SDLoc DL(N);
25609 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
25610}
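// Illustrative sketch (not part of AArch64ISelLowering.cpp): the identity the
// combine above exploits, ctlz(bitreverse(x)) == cttz(x), checked for all
// 8-bit values with straightforward reference implementations. Standalone
// harness; the names below are made up.
#include <cassert>
#include <cstdint>

static uint8_t bitReverse8(uint8_t X) {
  uint8_t R = 0;
  for (unsigned I = 0; I < 8; ++I)
    R = (uint8_t)((R << 1) | ((X >> I) & 1));
  return R;
}

static unsigned ctlz8(uint8_t X) {
  unsigned N = 0;
  for (int I = 7; I >= 0 && !((X >> I) & 1); --I)
    ++N;
  return N; // 8 for X == 0
}

static unsigned cttz8(uint8_t X) {
  unsigned N = 0;
  for (int I = 0; I < 8 && !((X >> I) & 1); ++I)
    ++N;
  return N; // 8 for X == 0
}

int main() {
  for (unsigned V = 0; V < 256; ++V)
    assert(ctlz8(bitReverse8((uint8_t)V)) == cttz8((uint8_t)V));
  return 0;
}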
25611
25612 // Turns the vector of indices into a vector of byte offsets by scaling Offset
25613// by (BitWidth / 8).
25615 SDLoc DL, unsigned BitWidth) {
25616 assert(Offset.getValueType().isScalableVector() &&
25617 "This method is only for scalable vectors of offsets");
25618
25619 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
25620 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
25621
25622 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
25623}
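// Illustrative sketch (not part of AArch64ISelLowering.cpp): the scalar
// arithmetic behind the index-to-byte-offset conversion above. Scaling by
// (BitWidth / 8) is performed as a left shift by Log2(BitWidth / 8), which is
// exact because element sizes are powers of two. Standalone harness; the
// names below are made up.
#include <cassert>
#include <cstdint>

static uint64_t indexToByteOffset(uint64_t Index, unsigned ElementBitWidth) {
  unsigned Shift = 0;
  for (unsigned Bytes = ElementBitWidth / 8; Bytes > 1; Bytes >>= 1)
    ++Shift; // Shift == Log2(ElementBitWidth / 8)
  return Index << Shift;
}

int main() {
  const unsigned Widths[] = {8, 16, 32, 64};
  for (unsigned BitWidth : Widths)
    for (uint64_t Index = 0; Index < 32; ++Index)
      assert(indexToByteOffset(Index, BitWidth) == Index * (BitWidth / 8));
  return 0;
}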
25624
25625/// Check if the value of \p OffsetInBytes can be used as an immediate for
25626/// the gather load/prefetch and scatter store instructions with vector base and
25627/// immediate offset addressing mode:
25628///
25629/// [<Zn>.[S|D]{, #<imm>}]
25630///
25631/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
25632inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
25633 unsigned ScalarSizeInBytes) {
25634 // The immediate is not a multiple of the scalar size.
25635 if (OffsetInBytes % ScalarSizeInBytes)
25636 return false;
25637
25638 // The immediate is out of range.
25639 if (OffsetInBytes / ScalarSizeInBytes > 31)
25640 return false;
25641
25642 return true;
25643}
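// Illustrative sketch (not part of AArch64ISelLowering.cpp): the
// [<Zn>.{S|D}, #<imm>] immediate rule that isValidImmForSVEVecImmAddrMode
// encodes. The byte offset must be a multiple of the element size and at most
// 31 elements, i.e. sizeof(<T>) * k with k in [0, 31]. Standalone harness;
// the function name below is made up.
#include <cassert>

static bool isValidSVEVecImm(unsigned OffsetInBytes, unsigned ScalarSizeInBytes) {
  return OffsetInBytes % ScalarSizeInBytes == 0 &&
         OffsetInBytes / ScalarSizeInBytes <= 31;
}

int main() {
  // 64-bit elements: valid byte offsets are 0, 8, 16, ..., 248.
  assert(isValidSVEVecImm(0, 8));
  assert(isValidSVEVecImm(248, 8));  // 31 * 8
  assert(!isValidSVEVecImm(256, 8)); // 32 * 8 is out of range
  assert(!isValidSVEVecImm(4, 8));   // not a multiple of the element size
  // 32-bit elements: valid byte offsets are 0, 4, ..., 124.
  assert(isValidSVEVecImm(124, 4));
  assert(!isValidSVEVecImm(126, 4));
  return 0;
}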
25644
25645/// Check if the value of \p Offset represents a valid immediate for the SVE
25646 /// gather load/prefetch and scatter store instructions with vector base and
25647/// immediate offset addressing mode:
25648///
25649/// [<Zn>.[S|D]{, #<imm>}]
25650///
25651/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
25653 unsigned ScalarSizeInBytes) {
25654 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
25655 return OffsetConst && isValidImmForSVEVecImmAddrMode(
25656 OffsetConst->getZExtValue(), ScalarSizeInBytes);
25657}
25658
25659 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
25660 unsigned Opcode,
25661 bool OnlyPackedOffsets = true) {
25662 const SDValue Src = N->getOperand(2);
25663 const EVT SrcVT = Src->getValueType(0);
25664 assert(SrcVT.isScalableVector() &&
25665 "Scatter stores are only possible for SVE vectors");
25666
25667 SDLoc DL(N);
25668 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
25669
25670 // Make sure that source data will fit into an SVE register
25672 return SDValue();
25673
25674 // For FPs, ACLE only supports _packed_ single and double precision types.
25675 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
25676 if (SrcElVT.isFloatingPoint())
25677 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
25678 ((Opcode != AArch64ISD::SST1Q_PRED &&
25679 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
25680 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
25681 return SDValue();
25682
25683 // Depending on the addressing mode, this is either a pointer or a vector of
25684 // pointers (that fits into one register)
25685 SDValue Base = N->getOperand(4);
25686 // Depending on the addressing mode, this is either a single offset or a
25687 // vector of offsets (that fits into one register)
25688 SDValue Offset = N->getOperand(5);
25689
25690 // For "scalar + vector of indices", just scale the indices. This only
25691 // applies to non-temporal scatters because there's no instruction that takes
25692 // indices.
25693 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
25694 Offset =
25696 Opcode = AArch64ISD::SSTNT1_PRED;
25697 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
25698 Offset =
25700 Opcode = AArch64ISD::SST1Q_PRED;
25701 }
25702
25703 // In the case of non-temporal scatter stores there's only one SVE instruction
25704 // per data-size: "scalar + vector", i.e.
25705 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25706 // Since we do have intrinsics that allow the arguments to be in a different
25707 // order, we may need to swap them to match the spec.
25708 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
25709 Offset.getValueType().isVector())
25711
25712 // SST1_IMM requires that the offset is an immediate that is:
25713 // * a multiple of #SizeInBytes,
25714 // * in the range [0, 31 x #SizeInBytes],
25715 // where #SizeInBytes is the size in bytes of the stored items. For
25716 // immediates outside that range and non-immediate scalar offsets use SST1 or
25717 // SST1_UXTW instead.
25718 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
25720 SrcVT.getScalarSizeInBits() / 8)) {
25721 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25723 else
25724 Opcode = AArch64ISD::SST1_PRED;
25725
25727 }
25728 }
25729
25730 auto &TLI = DAG.getTargetLoweringInfo();
25731 if (!TLI.isTypeLegal(Base.getValueType()))
25732 return SDValue();
25733
25734 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
25735 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
25736 // nxv2i64. Legalize accordingly.
25737 if (!OnlyPackedOffsets &&
25738 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25739 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
25740
25741 if (!TLI.isTypeLegal(Offset.getValueType()))
25742 return SDValue();
25743
25744 // Source value type that is representable in hardware
25745 EVT HwSrcVt = getSVEContainerType(SrcVT);
25746
25747 // Keep the original type of the input data to store - this is needed to be
25748 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
25749 // FP values we want the integer equivalent, so just use HwSrcVt.
25750 SDValue InputVT = DAG.getValueType(SrcVT);
25751 if (SrcVT.isFloatingPoint())
25752 InputVT = DAG.getValueType(HwSrcVt);
25753
25754 SDVTList VTs = DAG.getVTList(MVT::Other);
25755 SDValue SrcNew;
25756
25757 if (Src.getValueType().isFloatingPoint())
25758 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
25759 else
25760 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
25761
25762 SDValue Ops[] = {N->getOperand(0), // Chain
25763 SrcNew,
25764 N->getOperand(3), // Pg
25765 Base,
25766 Offset,
25767 InputVT};
25768
25769 return DAG.getNode(Opcode, DL, VTs, Ops);
25770}
25771
25772 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
25773 unsigned Opcode,
25774 bool OnlyPackedOffsets = true) {
25775 const EVT RetVT = N->getValueType(0);
25776 assert(RetVT.isScalableVector() &&
25777 "Gather loads are only possible for SVE vectors");
25778
25779 SDLoc DL(N);
25780
25781 // Make sure that the loaded data will fit into an SVE register
25783 return SDValue();
25784
25785 // Depending on the addressing mode, this is either a pointer or a vector of
25786 // pointers (that fits into one register)
25787 SDValue Base = N->getOperand(3);
25788 // Depending on the addressing mode, this is either a single offset or a
25789 // vector of offsets (that fits into one register)
25790 SDValue Offset = N->getOperand(4);
25791
25792 // For "scalar + vector of indices", scale the indices to obtain unscaled
25793 // offsets. This applies to non-temporal and quadword gathers, which do not
25794 // have an addressing mode with scaled offset.
25797 RetVT.getScalarSizeInBits());
25799 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
25801 RetVT.getScalarSizeInBits());
25803 }
25804
25805 // In the case of non-temporal gather loads and quadword gather loads there's
25806 // only one addressing mode: "vector + scalar", e.g.
25807 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25808 // Since we do have intrinsics that allow the arguments to be in a different
25809 // order, we may need to swap them to match the spec.
25810 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
25811 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
25812 Offset.getValueType().isVector())
25814
25815 // GLD{FF}1_IMM requires that the offset is an immediate that is:
25816 // * a multiple of #SizeInBytes,
25817 // * in the range [0, 31 x #SizeInBytes],
25818 // where #SizeInBytes is the size in bytes of the loaded items. For
25819 // immediates outside that range and non-immediate scalar offsets use
25820 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
25821 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
25824 RetVT.getScalarSizeInBits() / 8)) {
25825 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25826 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
25829 else
25830 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
25833
25835 }
25836 }
25837
25838 auto &TLI = DAG.getTargetLoweringInfo();
25839 if (!TLI.isTypeLegal(Base.getValueType()))
25840 return SDValue();
25841
25842 // Some gather load variants allow unpacked offsets, but only as nxv2i32
25843 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
25844 // nxv2i64. Legalize accordingly.
25845 if (!OnlyPackedOffsets &&
25846 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25847 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
25848
25849 // Return value type that is representable in hardware
25850 EVT HwRetVt = getSVEContainerType(RetVT);
25851
25852 // Keep the original output value type around - this is needed to be able to
25853 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
25854 // values we want the integer equivalent, so just use HwRetVT.
25855 SDValue OutVT = DAG.getValueType(RetVT);
25856 if (RetVT.isFloatingPoint())
25857 OutVT = DAG.getValueType(HwRetVt);
25858
25859 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
25860 SDValue Ops[] = {N->getOperand(0), // Chain
25861 N->getOperand(2), // Pg
25862 Base, Offset, OutVT};
25863
25864 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
25865 SDValue LoadChain = SDValue(Load.getNode(), 1);
25866
25867 if (RetVT.isInteger() && (RetVT != HwRetVt))
25868 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
25869
25870 // If the original return value was FP, bitcast accordingly. Doing it here
25871 // means that we can avoid adding TableGen patterns for FPs.
25872 if (RetVT.isFloatingPoint())
25873 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
25874
25875 return DAG.getMergeValues({Load, LoadChain}, DL);
25876}
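// Illustrative example (added commentary, not from the original source): a
// gather intrinsic roughly of the form
//   %v = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(
//            <vscale x 2 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx)
// becomes a GLD1-style node producing the nxv2i64 container value, followed by
// a TRUNCATE back to nxv2i32 (or a BITCAST for floating-point results), with
// the original element type recorded in the final operand.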
25877
25878static SDValue
25879performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25880 SelectionDAG &DAG) {
25881 SDLoc DL(N);
25882 SDValue Src = N->getOperand(0);
25883 unsigned Opc = Src->getOpcode();
25884
25885 // Sign extend of an unsigned unpack -> signed unpack
25886 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
25887
25888 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
25889 : AArch64ISD::SUNPKLO;
25890
25891 // Push the sign extend to the operand of the unpack
25892 // This is necessary where, for example, the operand of the unpack
25893 // is another unpack:
25894 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
25895 // ->
25896 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
25897 // ->
25898 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
25899 SDValue ExtOp = Src->getOperand(0);
25900 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
25901 EVT EltTy = VT.getVectorElementType();
25902 (void)EltTy;
25903
25904 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
25905 "Sign extending from an invalid type");
25906
25907 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
25908
25908
25909 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
25910 ExtOp, DAG.getValueType(ExtVT));
25911
25912 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
25913 }
25914
25915 if (DCI.isBeforeLegalizeOps())
25916 return SDValue();
25917
25918 if (!EnableCombineMGatherIntrinsics)
25919 return SDValue();
25920
25921 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
25922 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
25923 unsigned NewOpc;
25924 unsigned MemVTOpNum = 4;
25925 switch (Opc) {
25928 MemVTOpNum = 3;
25929 break;
25932 MemVTOpNum = 3;
25933 break;
25936 MemVTOpNum = 3;
25937 break;
25940 break;
25943 break;
25946 break;
25949 break;
25952 break;
25955 break;
25958 break;
25961 break;
25964 break;
25967 break;
25970 break;
25973 break;
25976 break;
25979 break;
25982 break;
25983 default:
25984 return SDValue();
25985 }
25986
25987 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
25988 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
25989
25990 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
25991 return SDValue();
25992
25993 EVT DstVT = N->getValueType(0);
25994 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
25995
25996 SmallVector<SDValue, 5> Ops;
25997 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
25998 Ops.push_back(Src->getOperand(I));
25999
26000 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
26001 DCI.CombineTo(N, ExtLoad);
26002 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
26003
26004 // Return N so it doesn't get rechecked
26005 return SDValue(N, 0);
26006}
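// Rough before/after sketch for the combine above (added for illustration;
// exact node shapes are approximate):
//   t1: nxv2i64 = GLD1_MERGE_ZERO chain, pg, base, offsets, ValueType:nxv2i8
//   t2: nxv2i64 = sign_extend_inreg t1, ValueType:nxv2i8
// is rewritten to the signed-extending gather
//   t3: nxv2i64 = GLD1S_MERGE_ZERO chain, pg, base, offsets, ValueType:nxv2i8
// provided the sign-extended type matches the gather's memory type and the
// load has no other users.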
26007
26008/// Legalize the gather prefetch (scalar + vector addressing mode) when the
26009/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
26010/// != nxv2i32) do not need legalization.
26011static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
26012 const unsigned OffsetPos = 4;
26013 SDValue Offset = N->getOperand(OffsetPos);
26014
26015 // Not an unpacked vector, bail out.
26016 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
26017 return SDValue();
26018
26019 // Extend the unpacked offset vector to 64-bit lanes.
26020 SDLoc DL(N);
26021 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
26022 SmallVector<SDValue, 5> Ops(N->ops());
26023 // Replace the offset operand with the 64-bit one.
26024 Ops[OffsetPos] = Offset;
26025
26026 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
26027}
26028
26029/// Combines a node carrying the intrinsic
26030/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
26031/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
26032/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
26033/// sve gather prefetch instruction with vector plus immediate addressing mode.
26034static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
26035 unsigned ScalarSizeInBytes) {
26036 const unsigned ImmPos = 4, OffsetPos = 3;
26037 // No need to combine the node if the immediate is valid...
26038 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
26039 return SDValue();
26040
26041 // ...otherwise swap the offset base with the offset...
26042 SmallVector<SDValue, 5> Ops(N->ops());
26043 std::swap(Ops[ImmPos], Ops[OffsetPos]);
26044 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
26045 // `aarch64_sve_prfb_gather_uxtw_index`.
26046 SDLoc DL(N);
26047 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
26048 MVT::i64);
26049
26050 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
26051}
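// Hypothetical example (added for illustration, details approximate): for
//   @llvm.aarch64.sve.prfb.gather.scalar.offset(%pg, %zbase, i64 40, i32 %op)
// the offset 40 is outside the [0, 31 x 1] byte range accepted by the
// vector-plus-immediate form, so the node above is rewritten to use
// @llvm.aarch64.sve.prfb.gather.uxtw.index with the base and offset swapped.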
26052
26053// Return true if the vector operation can guarantee that only the first lane
26054// of its result contains data, with all bits in other lanes set to zero.
26055static bool isLanes1toNKnownZero(SDValue Op) {
26056 switch (Op.getOpcode()) {
26057 default:
26058 return false;
26074 return true;
26075 }
26076}
26077
26078static SDValue removeRedundantInsertVectorElt(SDNode *N) {
26079 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
26080 SDValue InsertVec = N->getOperand(0);
26081 SDValue InsertElt = N->getOperand(1);
26082 SDValue InsertIdx = N->getOperand(2);
26083
26084 // We only care about inserts into the first element...
26085 if (!isNullConstant(InsertIdx))
26086 return SDValue();
26087 // ...of a zero'd vector...
26088 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
26089 return SDValue();
26090 // ...where the inserted data was previously extracted...
26091 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26092 return SDValue();
26093
26094 SDValue ExtractVec = InsertElt.getOperand(0);
26095 SDValue ExtractIdx = InsertElt.getOperand(1);
26096
26097 // ...from the first element of a vector.
26098 if (!isNullConstant(ExtractIdx))
26099 return SDValue();
26100
26101 // If we get here we are effectively trying to zero lanes 1-N of a vector.
26102
26103 // Ensure there's no type conversion going on.
26104 if (N->getValueType(0) != ExtractVec.getValueType())
26105 return SDValue();
26106
26107 if (!isLanes1toNKnownZero(ExtractVec))
26108 return SDValue();
26109
26110 // The explicit zeroing is redundant.
26111 return ExtractVec;
26112}
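// Worked example (added for illustration; node names approximate):
//   t1: nxv4i32 = <SVE reduction, e.g. UADDV_PRED>  ; lanes 1..N known zero
//   t2: i32     = extract_vector_elt t1, 0
//   t3: nxv4i32 = insert_vector_elt (splat 0), t2, 0
// Since only lane 0 of t1 can be non-zero and no type conversion is involved,
// t3 is replaced directly by t1.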
26113
26114static SDValue
26115performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26116 if (SDValue Res = removeRedundantInsertVectorElt(N))
26117 return Res;
26118
26119 return performPostLD1Combine(N, DCI, true);
26120}
26121
26122static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
26123 TargetLowering::DAGCombinerInfo &DCI,
26124 const AArch64Subtarget *Subtarget) {
26125 SDValue N0 = N->getOperand(0);
26126 EVT VT = N->getValueType(0);
26127
26128 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
26129 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
26130 return SDValue();
26131
26132 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
26133 EVT EltVT = VT.getVectorElementType();
26134 return EltVT == MVT::f32 || EltVT == MVT::f64;
26135 };
26136
26137 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
26138 // We purposefully don't care about legality of the nodes here as we know
26139 // they can be split down into something legal.
26140 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
26141 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
26142 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
26143 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
26144 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
26145 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
26146 LN0->getChain(), LN0->getBasePtr(),
26147 N0.getValueType(), LN0->getMemOperand());
26148 DCI.CombineTo(N, ExtLoad);
26149 DCI.CombineTo(
26150 N0.getNode(),
26151 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
26152 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
26153 ExtLoad.getValue(1));
26154 return SDValue(N, 0); // Return N so it doesn't get rechecked!
26155 }
26156
26157 return SDValue();
26158}
26159
26160static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
26161 const AArch64Subtarget *Subtarget) {
26162 EVT VT = N->getValueType(0);
26163
26164 // Don't expand for NEON, SVE2 or SME
26165 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
26166 return SDValue();
26167
26168 SDLoc DL(N);
26169
26170 SDValue Mask = N->getOperand(0);
26171 SDValue In1 = N->getOperand(1);
26172 SDValue In2 = N->getOperand(2);
26173
26174 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
26175 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
26176 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
26177 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
26178}
26179
26180static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
26181 EVT VT = N->getValueType(0);
26182
26183 SDValue Insert = N->getOperand(0);
26184 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
26185 return SDValue();
26186
26187 if (!Insert.getOperand(0).isUndef())
26188 return SDValue();
26189
26190 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
26191 uint64_t IdxDupLane = N->getConstantOperandVal(1);
26192 if (IdxInsert != 0 || IdxDupLane != 0)
26193 return SDValue();
26194
26195 SDValue Bitcast = Insert.getOperand(1);
26196 if (Bitcast.getOpcode() != ISD::BITCAST)
26197 return SDValue();
26198
26199 SDValue Subvec = Bitcast.getOperand(0);
26200 EVT SubvecVT = Subvec.getValueType();
26201 if (!SubvecVT.is128BitVector())
26202 return SDValue();
26203 EVT NewSubvecVT =
26204 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
26205
26206 SDLoc DL(N);
26207 SDValue NewInsert =
26208 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
26209 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
26210 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
26211 NewInsert, N->getOperand(1));
26212 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
26213}
26214
26215// Try to combine mull with uzp1.
26216static SDValue tryCombineMULLWithUZP1(SDNode *N,
26217 TargetLowering::DAGCombinerInfo &DCI,
26218 SelectionDAG &DAG) {
26219 if (DCI.isBeforeLegalizeOps())
26220 return SDValue();
26221
26222 SDValue LHS = N->getOperand(0);
26223 SDValue RHS = N->getOperand(1);
26224
26225 SDValue ExtractHigh;
26226 SDValue ExtractLow;
26227 SDValue TruncHigh;
26228 SDValue TruncLow;
26229 SDLoc DL(N);
26230
26231 // Check the operands are trunc and extract_high.
26232 if (isEssentiallyExtractHighSubvector(LHS) &&
26233 RHS.getOpcode() == ISD::TRUNCATE) {
26234 TruncHigh = RHS;
26235 if (LHS.getOpcode() == ISD::BITCAST)
26236 ExtractHigh = LHS.getOperand(0);
26237 else
26238 ExtractHigh = LHS;
26239 } else if (isEssentiallyExtractHighSubvector(RHS) &&
26240 LHS.getOpcode() == ISD::TRUNCATE) {
26241 TruncHigh = LHS;
26242 if (RHS.getOpcode() == ISD::BITCAST)
26243 ExtractHigh = RHS.getOperand(0);
26244 else
26245 ExtractHigh = RHS;
26246 } else
26247 return SDValue();
26248
26249 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
26250 // with uzp1.
26251 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
26252 SDValue TruncHighOp = TruncHigh.getOperand(0);
26253 EVT TruncHighOpVT = TruncHighOp.getValueType();
26254 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
26255 DAG.isSplatValue(TruncHighOp, false))
26256 return SDValue();
26257
26258 // Check there is other extract_high with same source vector.
26259 // For example,
26260 //
26261 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
26262 // t12: v4i16 = truncate t11
26263 // t31: v4i32 = AArch64ISD::SMULL t18, t12
26264 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
26265 // t16: v4i16 = truncate t15
26266 // t30: v4i32 = AArch64ISD::SMULL t23, t1
26267 //
26268 // This dagcombine assumes the two extract_high nodes use the same source
26269 // vector in order to detect the pair of the mull. If they have different
26270 // source vectors, this code will not work.
26271 // TODO: Should also try to look through a bitcast.
26272 bool HasFoundMULLow = true;
26273 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
26274 if (ExtractHighSrcVec->use_size() != 2)
26275 HasFoundMULLow = false;
26276
26277 // Find ExtractLow.
26278 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
26279 if (User == ExtractHigh.getNode())
26280 continue;
26281
26282 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26283 !isNullConstant(User->getOperand(1))) {
26284 HasFoundMULLow = false;
26285 break;
26286 }
26287
26288 ExtractLow.setNode(User);
26289 }
26290
26291 if (!ExtractLow || !ExtractLow->hasOneUse())
26292 HasFoundMULLow = false;
26293
26294 // Check ExtractLow's user.
26295 if (HasFoundMULLow) {
26296 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
26297 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
26298 HasFoundMULLow = false;
26299 } else {
26300 if (ExtractLowUser->getOperand(0) == ExtractLow) {
26301 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
26302 TruncLow = ExtractLowUser->getOperand(1);
26303 else
26304 HasFoundMULLow = false;
26305 } else {
26306 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
26307 TruncLow = ExtractLowUser->getOperand(0);
26308 else
26309 HasFoundMULLow = false;
26310 }
26311 }
26312 }
26313
26314 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
26315 // with uzp1.
26316 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
26317 EVT TruncHighVT = TruncHigh.getValueType();
26318 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
26319 SDValue TruncLowOp =
26320 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
26321 EVT TruncLowOpVT = TruncLowOp.getValueType();
26322 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
26323 DAG.isSplatValue(TruncLowOp, false)))
26324 return SDValue();
26325
26326 // Create uzp1, extract_high and extract_low.
26327 if (TruncHighOpVT != UZP1VT)
26328 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
26329 if (TruncLowOpVT != UZP1VT)
26330 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
26331
26332 SDValue UZP1 =
26333 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
26334 SDValue HighIdxCst =
26335 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
26336 SDValue NewTruncHigh =
26337 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
26338 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
26339
26340 if (HasFoundMULLow) {
26341 EVT TruncLowVT = TruncLow.getValueType();
26342 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
26343 UZP1, ExtractLow.getOperand(1));
26344 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
26345 }
26346
26347 return SDValue(N, 0);
26348}
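// Rough before/after sketch for the combine above (added for illustration;
// node numbering is arbitrary):
//   t31: v4i32 = SMULL (extract_subvector t2, 0), (truncate t11)
//   t30: v4i32 = SMULL (extract_subvector t2, 4), (truncate t15)
// The two truncates are replaced by one UZP1 of the wide operands plus two
// extract_subvector nodes, so both multiplies share a single narrowing step.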
26349
26350static SDValue performMULLCombine(SDNode *N,
26351 TargetLowering::DAGCombinerInfo &DCI,
26352 SelectionDAG &DAG) {
26353 if (SDValue Val =
26354 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
26355 return Val;
26356
26357 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
26358 return Val;
26359
26360 return SDValue();
26361}
26362
26363static SDValue
26364performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26365 SelectionDAG &DAG) {
26366 // Let's do below transform.
26367 //
26368 // t34: v4i32 = AArch64ISD::UADDLV t2
26369 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
26370 // t7: i64 = zero_extend t35
26371 // t20: v1i64 = scalar_to_vector t7
26372 // ==>
26373 // t34: v4i32 = AArch64ISD::UADDLV t2
26374 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
26375 // t40: v1i64 = AArch64ISD::NVCAST t39
26376 if (DCI.isBeforeLegalizeOps())
26377 return SDValue();
26378
26379 EVT VT = N->getValueType(0);
26380 if (VT != MVT::v1i64)
26381 return SDValue();
26382
26383 SDValue ZEXT = N->getOperand(0);
26384 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
26385 return SDValue();
26386
26387 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
26388 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
26389 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
26390 return SDValue();
26391
26392 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
26393 return SDValue();
26394
26395 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
26396 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
26397 UADDLV.getValueType() != MVT::v4i32 ||
26398 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
26399 return SDValue();
26400
26401 // Let's generate new sequence with AArch64ISD::NVCAST.
26402 SDLoc DL(N);
26403 SDValue EXTRACT_SUBVEC =
26404 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
26405 DAG.getConstant(0, DL, MVT::i64));
26406 SDValue NVCAST =
26407 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
26408
26409 return NVCAST;
26410}
26411
26412/// If the operand is a bitwise AND with a constant RHS, and the shift has a
26413/// constant RHS and is the only use, we can pull it out of the shift, i.e.
26414///
26415/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
26416///
26417/// We prefer this canonical form to match existing isel patterns.
26418static SDValue performSHLCombine(SDNode *N,
26419 TargetLowering::DAGCombinerInfo &DCI,
26420 SelectionDAG &DAG) {
26421 if (DCI.isBeforeLegalizeOps())
26422 return SDValue();
26423
26424 SDValue Op0 = N->getOperand(0);
26425 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
26426 return SDValue();
26427
26428 SDValue C1 = Op0->getOperand(1);
26429 SDValue C2 = N->getOperand(1);
26430 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
26431 return SDValue();
26432
26433 // Might be folded into shifted op, do not lower.
26434 if (N->hasOneUse()) {
26435 unsigned UseOpc = N->user_begin()->getOpcode();
26436 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
26437 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
26438 return SDValue();
26439 }
26440
26441 SDLoc DL(N);
26442 EVT VT = N->getValueType(0);
26443
26444 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
26445 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
26446 // causing infinite loop. Result may also be worse.
26447 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
26448 if (!isa<ConstantSDNode>(NewRHS))
26449 return SDValue();
26450
26451 SDValue X = Op0->getOperand(0);
26452 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
26453 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
26454}
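// Small worked example (added for illustration): with C1 = 0xFF and C2 = 2,
//   (shl (and X, 0xFF), 2)
// is canonicalized to
//   (and (shl X, 2), 0x3FC)
// since 0xFF << 2 constant-folds to 0x3FC, matching the shifted-mask isel
// patterns mentioned above.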
26455
26456SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
26457 DAGCombinerInfo &DCI) const {
26458 SelectionDAG &DAG = DCI.DAG;
26459 switch (N->getOpcode()) {
26460 default:
26461 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
26462 break;
26463 case ISD::VECREDUCE_AND:
26464 case ISD::VECREDUCE_OR:
26465 case ISD::VECREDUCE_XOR:
26466 return performVecReduceBitwiseCombine(N, DCI, DAG);
26467 case ISD::ADD:
26468 case ISD::SUB:
26469 return performAddSubCombine(N, DCI);
26470 case ISD::BUILD_VECTOR:
26471 return performBuildVectorCombine(N, DCI, DAG);
26472 case ISD::TRUNCATE:
26473 return performTruncateCombine(N, DAG, DCI);
26474 case AArch64ISD::ANDS:
26475 return performFlagSettingCombine(N, DCI, ISD::AND);
26476 case AArch64ISD::ADC:
26477 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
26478 return R;
26479 return foldADCToCINC(N, DAG);
26480 case AArch64ISD::SBC:
26481 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
26482 case AArch64ISD::ADCS:
26483 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
26484 return R;
26485 break;
26486 case AArch64ISD::SBCS:
26487 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
26488 return R;
26489 break;
26490 case AArch64ISD::BICi: {
26491 APInt DemandedBits =
26492 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
26493 APInt DemandedElts =
26494 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
26495
26496 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
26497 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
26498 return SDValue();
26499
26500 break;
26501 }
26502 case ISD::XOR:
26503 return performXorCombine(N, DAG, DCI, Subtarget);
26504 case ISD::MUL:
26505 return performMulCombine(N, DAG, DCI, Subtarget);
26506 case ISD::SINT_TO_FP:
26507 case ISD::UINT_TO_FP:
26508 return performIntToFpCombine(N, DAG, DCI, Subtarget);
26509 case ISD::FP_TO_SINT:
26510 case ISD::FP_TO_UINT:
26511 case ISD::FP_TO_SINT_SAT:
26512 case ISD::FP_TO_UINT_SAT:
26513 return performFpToIntCombine(N, DAG, DCI, Subtarget);
26514 case ISD::OR:
26515 return performORCombine(N, DCI, Subtarget, *this);
26516 case ISD::AND:
26517 return performANDCombine(N, DCI);
26518 case ISD::FADD:
26519 return performFADDCombine(N, DCI);
26520 case ISD::INTRINSIC_WO_CHAIN:
26521 return performIntrinsicCombine(N, DCI, Subtarget);
26522 case ISD::ANY_EXTEND:
26523 case ISD::ZERO_EXTEND:
26524 case ISD::SIGN_EXTEND:
26525 return performExtendCombine(N, DCI, DAG);
26526 case ISD::SIGN_EXTEND_INREG:
26527 return performSignExtendInRegCombine(N, DCI, DAG);
26528 case ISD::CONCAT_VECTORS:
26529 return performConcatVectorsCombine(N, DCI, DAG);
26530 case ISD::EXTRACT_SUBVECTOR:
26531 return performExtractSubvectorCombine(N, DCI, DAG);
26532 case ISD::INSERT_SUBVECTOR:
26533 return performInsertSubvectorCombine(N, DCI, DAG);
26534 case ISD::SELECT:
26535 return performSelectCombine(N, DCI);
26536 case ISD::VSELECT:
26537 return performVSelectCombine(N, DCI.DAG);
26538 case ISD::SETCC:
26539 return performSETCCCombine(N, DCI, DAG);
26540 case ISD::LOAD:
26541 return performLOADCombine(N, DCI, DAG, Subtarget);
26542 case ISD::STORE:
26543 return performSTORECombine(N, DCI, DAG, Subtarget);
26544 case ISD::MSTORE:
26545 return performMSTORECombine(N, DCI, DAG, Subtarget);
26546 case ISD::MGATHER:
26547 case ISD::MSCATTER:
26548 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
26549 return performMaskedGatherScatterCombine(N, DCI, DAG);
26550 case ISD::FP_EXTEND:
26551 return performFPExtendCombine(N, DAG, DCI, Subtarget);
26552 case AArch64ISD::BRCOND:
26553 return performBRCONDCombine(N, DCI, DAG);
26554 case AArch64ISD::TBNZ:
26555 case AArch64ISD::TBZ:
26556 return performTBZCombine(N, DCI, DAG);
26557 case AArch64ISD::CSEL:
26558 return performCSELCombine(N, DCI, DAG);
26559 case AArch64ISD::DUP:
26560 case AArch64ISD::DUPLANE8:
26561 case AArch64ISD::DUPLANE16:
26562 case AArch64ISD::DUPLANE32:
26563 case AArch64ISD::DUPLANE64:
26564 return performDUPCombine(N, DCI);
26565 case AArch64ISD::DUPLANE128:
26566 return performDupLane128Combine(N, DAG);
26567 case AArch64ISD::NVCAST:
26568 return performNVCASTCombine(N, DAG);
26569 case AArch64ISD::SPLICE:
26570 return performSpliceCombine(N, DAG);
26571 case AArch64ISD::UUNPKLO:
26572 case AArch64ISD::UUNPKHI:
26573 return performUnpackCombine(N, DAG, Subtarget);
26574 case AArch64ISD::UZP1:
26575 case AArch64ISD::UZP2:
26576 return performUzpCombine(N, DAG, Subtarget);
26577 case AArch64ISD::SETCC_MERGE_ZERO:
26578 return performSetccMergeZeroCombine(N, DCI);
26595 return performGLD1Combine(N, DAG);
26596 case AArch64ISD::VASHR:
26597 case AArch64ISD::VLSHR:
26598 return performVectorShiftCombine(N, *this, DCI);
26599 case AArch64ISD::SUNPKLO:
26600 return performSunpkloCombine(N, DAG);
26601 case AArch64ISD::BSP:
26602 return performBSPExpandForSVE(N, DAG, Subtarget);
26603 case ISD::INSERT_VECTOR_ELT:
26604 return performInsertVectorEltCombine(N, DCI);
26605 case ISD::EXTRACT_VECTOR_ELT:
26606 return performExtractVectorEltCombine(N, DCI, Subtarget);
26607 case ISD::VECREDUCE_ADD:
26608 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
26609 case AArch64ISD::UADDV:
26610 return performUADDVCombine(N, DAG);
26611 case AArch64ISD::SMULL:
26612 case AArch64ISD::UMULL:
26613 case AArch64ISD::PMULL:
26614 return performMULLCombine(N, DCI, DAG);
26615 case ISD::INTRINSIC_VOID:
26616 case ISD::INTRINSIC_W_CHAIN:
26617 switch (N->getConstantOperandVal(1)) {
26618 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
26619 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
26620 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
26621 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
26622 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
26623 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
26624 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
26625 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
26626 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
26627 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
26628 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
26629 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
26630 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
26631 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
26632 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
26633 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
26634 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
26635 case Intrinsic::aarch64_neon_ld2:
26636 case Intrinsic::aarch64_neon_ld3:
26637 case Intrinsic::aarch64_neon_ld4:
26638 case Intrinsic::aarch64_neon_ld1x2:
26639 case Intrinsic::aarch64_neon_ld1x3:
26640 case Intrinsic::aarch64_neon_ld1x4:
26641 case Intrinsic::aarch64_neon_ld2lane:
26642 case Intrinsic::aarch64_neon_ld3lane:
26643 case Intrinsic::aarch64_neon_ld4lane:
26644 case Intrinsic::aarch64_neon_ld2r:
26645 case Intrinsic::aarch64_neon_ld3r:
26646 case Intrinsic::aarch64_neon_ld4r:
26647 case Intrinsic::aarch64_neon_st2:
26648 case Intrinsic::aarch64_neon_st3:
26649 case Intrinsic::aarch64_neon_st4:
26650 case Intrinsic::aarch64_neon_st1x2:
26651 case Intrinsic::aarch64_neon_st1x3:
26652 case Intrinsic::aarch64_neon_st1x4:
26653 case Intrinsic::aarch64_neon_st2lane:
26654 case Intrinsic::aarch64_neon_st3lane:
26655 case Intrinsic::aarch64_neon_st4lane:
26656 return performNEONPostLDSTCombine(N, DCI, DAG);
26657 case Intrinsic::aarch64_sve_ldnt1:
26658 return performLDNT1Combine(N, DAG);
26659 case Intrinsic::aarch64_sve_ld1rq:
26660 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
26661 case Intrinsic::aarch64_sve_ld1ro:
26662 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
26663 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
26665 case Intrinsic::aarch64_sve_ldnt1_gather:
26667 case Intrinsic::aarch64_sve_ldnt1_gather_index:
26668 return performGatherLoadCombine(N, DAG,
26670 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
26672 case Intrinsic::aarch64_sve_ld1:
26674 case Intrinsic::aarch64_sve_ldnf1:
26676 case Intrinsic::aarch64_sve_ldff1:
26678 case Intrinsic::aarch64_sve_st1:
26679 return performST1Combine(N, DAG);
26680 case Intrinsic::aarch64_sve_stnt1:
26681 return performSTNT1Combine(N, DAG);
26682 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
26684 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
26686 case Intrinsic::aarch64_sve_stnt1_scatter:
26688 case Intrinsic::aarch64_sve_stnt1_scatter_index:
26690 case Intrinsic::aarch64_sve_ld1_gather:
26692 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
26693 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
26695 case Intrinsic::aarch64_sve_ld1q_gather_index:
26696 return performGatherLoadCombine(N, DAG,
26698 case Intrinsic::aarch64_sve_ld1_gather_index:
26699 return performGatherLoadCombine(N, DAG,
26701 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
26703 /*OnlyPackedOffsets=*/false);
26704 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
26706 /*OnlyPackedOffsets=*/false);
26707 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
26708 return performGatherLoadCombine(N, DAG,
26710 /*OnlyPackedOffsets=*/false);
26711 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
26712 return performGatherLoadCombine(N, DAG,
26714 /*OnlyPackedOffsets=*/false);
26715 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
26717 case Intrinsic::aarch64_sve_ldff1_gather:
26719 case Intrinsic::aarch64_sve_ldff1_gather_index:
26720 return performGatherLoadCombine(N, DAG,
26722 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
26723 return performGatherLoadCombine(N, DAG,
26725 /*OnlyPackedOffsets=*/false);
26726 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
26727 return performGatherLoadCombine(N, DAG,
26729 /*OnlyPackedOffsets=*/false);
26730 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
26731 return performGatherLoadCombine(N, DAG,
26733 /*OnlyPackedOffsets=*/false);
26734 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
26735 return performGatherLoadCombine(N, DAG,
26737 /*OnlyPackedOffsets=*/false);
26738 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
26739 return performGatherLoadCombine(N, DAG,
26741 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
26742 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
26744 case Intrinsic::aarch64_sve_st1q_scatter_index:
26746 case Intrinsic::aarch64_sve_st1_scatter:
26748 case Intrinsic::aarch64_sve_st1_scatter_index:
26750 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
26752 /*OnlyPackedOffsets=*/false);
26753 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
26755 /*OnlyPackedOffsets=*/false);
26756 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
26757 return performScatterStoreCombine(N, DAG,
26759 /*OnlyPackedOffsets=*/false);
26760 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
26761 return performScatterStoreCombine(N, DAG,
26763 /*OnlyPackedOffsets=*/false);
26764 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
26766 case Intrinsic::aarch64_rndr:
26767 case Intrinsic::aarch64_rndrrs: {
26768 unsigned IntrinsicID = N->getConstantOperandVal(1);
26769 auto Register =
26770 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
26771 : AArch64SysReg::RNDRRS);
26772 SDLoc DL(N);
26773 SDValue A = DAG.getNode(
26774 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
26775 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
26776 SDValue B = DAG.getNode(
26777 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
26778 DAG.getConstant(0, DL, MVT::i32),
26779 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
26780 return DAG.getMergeValues(
26781 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
26782 }
26783 case Intrinsic::aarch64_sme_ldr_zt:
26784 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
26785 DAG.getVTList(MVT::Other), N->getOperand(0),
26786 N->getOperand(2), N->getOperand(3));
26787 case Intrinsic::aarch64_sme_str_zt:
26788 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
26789 DAG.getVTList(MVT::Other), N->getOperand(0),
26790 N->getOperand(2), N->getOperand(3));
26791 default:
26792 break;
26793 }
26794 break;
26795 case ISD::GlobalAddress:
26796 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
26797 case ISD::CTLZ:
26798 return performCTLZCombine(N, DAG, Subtarget);
26799 case ISD::SCALAR_TO_VECTOR:
26800 return performScalarToVectorCombine(N, DCI, DAG);
26801 case ISD::SHL:
26802 return performSHLCombine(N, DCI, DAG);
26803 }
26804 return SDValue();
26805}
26806
26807// Check if the return value is used only as a return value, as otherwise
26808// we can't perform a tail-call. In particular, we need to check for
26809// target ISD nodes that are returns and any other "odd" constructs
26810// that the generic analysis code won't necessarily catch.
26811bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
26812 SDValue &Chain) const {
26813 if (N->getNumValues() != 1)
26814 return false;
26815 if (!N->hasNUsesOfValue(1, 0))
26816 return false;
26817
26818 SDValue TCChain = Chain;
26819 SDNode *Copy = *N->user_begin();
26820 if (Copy->getOpcode() == ISD::CopyToReg) {
26821 // If the copy has a glue operand, we conservatively assume it isn't safe to
26822 // perform a tail call.
26823 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
26824 MVT::Glue)
26825 return false;
26826 TCChain = Copy->getOperand(0);
26827 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
26828 return false;
26829
26830 bool HasRet = false;
26831 for (SDNode *Node : Copy->users()) {
26832 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
26833 return false;
26834 HasRet = true;
26835 }
26836
26837 if (!HasRet)
26838 return false;
26839
26840 Chain = TCChain;
26841 return true;
26842}
26843
26844// Return whether an instruction can potentially be optimized to a tail
26845// call. This will cause the optimizers to attempt to move, or duplicate,
26846// return instructions to help enable tail call optimizations for this
26847// instruction.
26848bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
26849 return CI->isTailCall();
26850}
26851
26852bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
26853 Register Offset, bool IsPre,
26854 MachineRegisterInfo &MRI) const {
26855 auto CstOffset = getIConstantVRegVal(Offset, MRI);
26856 if (!CstOffset || CstOffset->isZero())
26857 return false;
26858
26859 // All of the indexed addressing mode instructions take a signed 9 bit
26860 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
26861 // encodes the sign/indexing direction.
26862 return isInt<9>(CstOffset->getSExtValue());
26863}
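// For example (added commentary): a G_PTR_ADD of +8 or -256 can be folded
// into a pre/post-indexed access, whereas +256 or a zero offset cannot,
// because the indexed forms only accept signed 9-bit immediates in
// [-256, 255] and a zero update is pointless.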
26864
26865bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
26866 SDValue &Base,
26867 SDValue &Offset,
26868 SelectionDAG &DAG) const {
26869 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
26870 return false;
26871
26872 // Non-null if there is exactly one user of the loaded value (ignoring chain).
26873 SDNode *ValOnlyUser = nullptr;
26874 for (SDUse &U : N->uses()) {
26875 if (U.getResNo() == 1)
26876 continue; // Ignore chain.
26877 if (ValOnlyUser == nullptr)
26878 ValOnlyUser = U.getUser();
26879 else {
26880 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
26881 break;
26882 }
26883 }
26884
26885 auto IsUndefOrZero = [](SDValue V) {
26886 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
26887 };
26888
26889 // If the only user of the value is a scalable vector splat, it is
26890 // preferable to do a replicating load (ld1r*).
26891 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
26892 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
26893 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
26894 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
26895 return false;
26896
26897 Base = Op->getOperand(0);
26898 // All of the indexed addressing mode instructions take a signed
26899 // 9 bit immediate offset.
26900 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
26901 int64_t RHSC = RHS->getSExtValue();
26902 if (Op->getOpcode() == ISD::SUB)
26903 RHSC = -(uint64_t)RHSC;
26904 if (!isInt<9>(RHSC))
26905 return false;
26906 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
26907 // when dealing with subtraction.
26908 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
26909 return true;
26910 }
26911 return false;
26912}
26913
26914bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
26915 SDValue &Offset,
26916 ISD::MemIndexedMode &AM,
26917 SelectionDAG &DAG) const {
26918 EVT VT;
26919 SDValue Ptr;
26920 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
26921 VT = LD->getMemoryVT();
26922 Ptr = LD->getBasePtr();
26923 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
26924 VT = ST->getMemoryVT();
26925 Ptr = ST->getBasePtr();
26926 } else
26927 return false;
26928
26929 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
26930 return false;
26931 AM = ISD::PRE_INC;
26932 return true;
26933}
26934
26935bool AArch64TargetLowering::getPostIndexedAddressParts(
26936 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
26937 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
26938 EVT VT;
26939 SDValue Ptr;
26940 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
26941 VT = LD->getMemoryVT();
26942 Ptr = LD->getBasePtr();
26943 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
26944 VT = ST->getMemoryVT();
26945 Ptr = ST->getBasePtr();
26946 } else
26947 return false;
26948
26949 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
26950 return false;
26951 // Post-indexing updates the base, so it's not a valid transform
26952 // if that's not the same as the load's pointer.
26953 if (Ptr != Base)
26954 return false;
26955 AM = ISD::POST_INC;
26956 return true;
26957}
26958
26959static void replaceBoolVectorBitcast(SDNode *N,
26960 SmallVectorImpl<SDValue> &Results,
26961 SelectionDAG &DAG) {
26962 SDLoc DL(N);
26963 SDValue Op = N->getOperand(0);
26964 EVT VT = N->getValueType(0);
26965 [[maybe_unused]] EVT SrcVT = Op.getValueType();
26966 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
26967 "Must be bool vector.");
26968
26969 // Special handling for Clang's __builtin_convertvector. For vectors with <8
26970 // elements, it adds a vector concatenation with undef(s). If we encounter
26971 // this here, we can skip the concat.
26972 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
26973 bool AllUndef = true;
26974 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
26975 AllUndef &= Op.getOperand(I).isUndef();
26976
26977 if (AllUndef)
26978 Op = Op.getOperand(0);
26979 }
26980
26981 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
26982 if (VectorBits)
26983 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
26984}
26985
26986static void CustomNonLegalBITCASTResults(SDNode *N,
26987 SmallVectorImpl<SDValue> &Results,
26988 SelectionDAG &DAG, EVT ExtendVT,
26989 EVT CastVT) {
26990 SDLoc DL(N);
26991 SDValue Op = N->getOperand(0);
26992 EVT VT = N->getValueType(0);
26993
26994 // Use SCALAR_TO_VECTOR for lane zero
26995 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
26996 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
26997 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
26998 Results.push_back(
26999 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
27000}
27001
27002void AArch64TargetLowering::ReplaceBITCASTResults(
27003 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27004 SDLoc DL(N);
27005 SDValue Op = N->getOperand(0);
27006 EVT VT = N->getValueType(0);
27007 EVT SrcVT = Op.getValueType();
27008
27009 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
27010 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
27011 return;
27012 }
27013
27014 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
27015 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
27016 return;
27017 }
27018
27019 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
27020 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
27021 return;
27022 }
27023
27024 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
27025 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
27026 "Expected fp->int bitcast!");
27027
27028 // Bitcasting between unpacked vector types of different element counts is
27029 // not a NOP because the live elements are laid out differently.
27030 // 01234567
27031 // e.g. nxv2i32 = XX??XX??
27032 // nxv4f16 = X?X?X?X?
27033 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
27034 return;
27035
27036 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
27037 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
27038 return;
27039 }
27040
27041 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27042 !VT.isVector())
27043 return replaceBoolVectorBitcast(N, Results, DAG);
27044
27045 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
27046 return;
27047
27048 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
27049 DAG.getUNDEF(MVT::i32), Op);
27050 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
27051 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
27052}
27053
27054static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
27055 SelectionDAG &DAG,
27056 const AArch64Subtarget *Subtarget) {
27057 EVT VT = N->getValueType(0);
27058 if (!VT.is256BitVector() ||
27059 (VT.getScalarType().isFloatingPoint() &&
27060 !N->getFlags().hasAllowReassociation()) ||
27061 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
27062 VT.getScalarType() == MVT::bf16)
27063 return;
27064
27065 SDValue X = N->getOperand(0);
27066 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
27067 if (!Shuf) {
27068 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
27069 X = N->getOperand(1);
27070 if (!Shuf)
27071 return;
27072 }
27073
27074 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
27075 return;
27076
27077 // Check the mask is 1,0,3,2,5,4,...
27078 ArrayRef<int> Mask = Shuf->getMask();
27079 for (int I = 0, E = Mask.size(); I < E; I++)
27080 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
27081 return;
27082
27083 SDLoc DL(N);
27084 auto LoHi = DAG.SplitVector(X, DL);
27085 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
27086 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
27087 LoHi.first, LoHi.second);
27088
27089 // Shuffle the elements back into order.
27090 SmallVector<int> NMask;
27091 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
27092 NMask.push_back(I);
27093 NMask.push_back(I);
27094 }
27095 Results.push_back(
27096 DAG.getVectorShuffle(VT, DL,
27097 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
27098 DAG.getUNDEF(LoHi.first.getValueType())),
27099 DAG.getUNDEF(VT), NMask));
27100}
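// Sketch of the transform above (added for illustration): for a v8f32 X with
// reassociation allowed,
//   fadd X, (vector_shuffle X, undef, <1,0,3,2,5,4,7,6>)
// is split into two v4f32 halves combined with AArch64ISD::ADDP, and the
// pairwise result is then shuffled so that every original lane receives the
// sum of its adjacent pair.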
27101
27102static void ReplaceReductionResults(SDNode *N,
27103 SmallVectorImpl<SDValue> &Results,
27104 SelectionDAG &DAG, unsigned InterOp,
27105 unsigned AcrossOp) {
27106 EVT LoVT, HiVT;
27107 SDValue Lo, Hi;
27108 SDLoc dl(N);
27109 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
27110 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
27111 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
27112 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
27113 Results.push_back(SplitVal);
27114}
27115
27116void AArch64TargetLowering::ReplaceExtractSubVectorResults(
27117 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27118 SDValue In = N->getOperand(0);
27119 EVT InVT = In.getValueType();
27120
27121 // Common code will handle these just fine.
27122 if (!InVT.isScalableVector() || !InVT.isInteger())
27123 return;
27124
27125 SDLoc DL(N);
27126 EVT VT = N->getValueType(0);
27127
27128 // The following checks bail if this is not a halving operation.
27129
27130 ElementCount ResEC = VT.getVectorElementCount();
27131
27132 if (InVT.getVectorElementCount() != (ResEC * 2))
27133 return;
27134
27135 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
27136 if (!CIndex)
27137 return;
27138
27139 unsigned Index = CIndex->getZExtValue();
27140 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
27141 return;
27142
27143 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
27144 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27145
27146 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
27147 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
27148}
27149
27150// Create an even/odd pair of X registers holding integer value V.
27151static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
27152 SDLoc dl(V.getNode());
27153 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
27154 if (DAG.getDataLayout().isBigEndian())
27155 std::swap (VLo, VHi);
27156 SDValue RegClass =
27157 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
27158 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
27159 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
27160 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
27161 return SDValue(
27162 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
27163}
27164
27165static void ReplaceCMP_SWAP_128Results(SDNode *N,
27166 SmallVectorImpl<SDValue> &Results,
27167 SelectionDAG &DAG,
27168 const AArch64Subtarget *Subtarget) {
27169 assert(N->getValueType(0) == MVT::i128 &&
27170 "AtomicCmpSwap on types less than 128 should be legal");
27171
27172 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
27173 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
27174 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
27175 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
27176 SDValue Ops[] = {
27177 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
27178 createGPRPairNode(DAG, N->getOperand(3)), // Store value
27179 N->getOperand(1), // Ptr
27180 N->getOperand(0), // Chain in
27181 };
27182
27183 unsigned Opcode;
27184 switch (MemOp->getMergedOrdering()) {
27185 case AtomicOrdering::Monotonic:
27186 Opcode = AArch64::CASPX;
27187 break;
27188 case AtomicOrdering::Acquire:
27189 Opcode = AArch64::CASPAX;
27190 break;
27191 case AtomicOrdering::Release:
27192 Opcode = AArch64::CASPLX;
27193 break;
27194 case AtomicOrdering::AcquireRelease:
27195 case AtomicOrdering::SequentiallyConsistent:
27196 Opcode = AArch64::CASPALX;
27197 break;
27198 default:
27199 llvm_unreachable("Unexpected ordering!");
27200 }
27201
27202 MachineSDNode *CmpSwap = DAG.getMachineNode(
27203 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
27204 DAG.setNodeMemRefs(CmpSwap, {MemOp});
27205
27206 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
27207 if (DAG.getDataLayout().isBigEndian())
27208 std::swap(SubReg1, SubReg2);
27209 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
27210 SDValue(CmpSwap, 0));
27211 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
27212 SDValue(CmpSwap, 0));
27213 Results.push_back(
27214 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
27215 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
27216 return;
27217 }
27218
27219 unsigned Opcode;
27220 switch (MemOp->getMergedOrdering()) {
27221 case AtomicOrdering::Monotonic:
27222 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
27223 break;
27224 case AtomicOrdering::Acquire:
27225 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
27226 break;
27227 case AtomicOrdering::Release:
27228 Opcode = AArch64::CMP_SWAP_128_RELEASE;
27229 break;
27230 case AtomicOrdering::AcquireRelease:
27231 case AtomicOrdering::SequentiallyConsistent:
27232 Opcode = AArch64::CMP_SWAP_128;
27233 break;
27234 default:
27235 llvm_unreachable("Unexpected ordering!");
27236 }
27237
27238 SDLoc DL(N);
27239 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
27240 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
27241 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
27242 New.first, New.second, N->getOperand(0)};
27243 SDNode *CmpSwap = DAG.getMachineNode(
27244 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
27245 Ops);
27246 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
27247
27248 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
27249 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
27250 Results.push_back(SDValue(CmpSwap, 3));
27251}
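// Illustrative lowering (added commentary, not from the source): with LSE, a
// sequentially consistent
//   cmpxchg ptr %p, i128 %expected, i128 %new
// selects the CASPALX node built above on an even/odd X register pair formed
// with REG_SEQUENCE; without LSE the CMP_SWAP_128 pseudo is used instead and
// is later expanded to an exclusive load/store loop.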
27252
27253static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
27254 AtomicOrdering Ordering) {
27255 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
27256 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
27257 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
27258 // ATOMIC_LOAD_CLR at any point.
27259 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
27260 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
27261 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
27262 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
27263
27264 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27265 // The operand will need to be XORed in a separate step.
27266 switch (Ordering) {
27267 case AtomicOrdering::Monotonic:
27268 return AArch64::LDCLRP;
27269 break;
27270 case AtomicOrdering::Acquire:
27271 return AArch64::LDCLRPA;
27272 break;
27273 case AtomicOrdering::Release:
27274 return AArch64::LDCLRPL;
27275 break;
27276 case AtomicOrdering::AcquireRelease:
27277 case AtomicOrdering::SequentiallyConsistent:
27278 return AArch64::LDCLRPAL;
27279 break;
27280 default:
27281 llvm_unreachable("Unexpected ordering!");
27282 }
27283 }
27284
27285 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
27286 switch (Ordering) {
27287 case AtomicOrdering::Monotonic:
27288 return AArch64::LDSETP;
27289 break;
27290 case AtomicOrdering::Acquire:
27291 return AArch64::LDSETPA;
27292 break;
27293 case AtomicOrdering::Release:
27294 return AArch64::LDSETPL;
27295 break;
27296 case AtomicOrdering::AcquireRelease:
27297 case AtomicOrdering::SequentiallyConsistent:
27298 return AArch64::LDSETPAL;
27299 break;
27300 default:
27301 llvm_unreachable("Unexpected ordering!");
27302 }
27303 }
27304
27305 if (ISDOpcode == ISD::ATOMIC_SWAP) {
27306 switch (Ordering) {
27307 case AtomicOrdering::Monotonic:
27308 return AArch64::SWPP;
27309 break;
27310 case AtomicOrdering::Acquire:
27311 return AArch64::SWPPA;
27312 break;
27313 case AtomicOrdering::Release:
27314 return AArch64::SWPPL;
27315 break;
27316 case AtomicOrdering::AcquireRelease:
27317 case AtomicOrdering::SequentiallyConsistent:
27318 return AArch64::SWPPAL;
27319 break;
27320 default:
27321 llvm_unreachable("Unexpected ordering!");
27322 }
27323 }
27324
27325 llvm_unreachable("Unexpected ISDOpcode!");
27326}
27327
27328static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
27329 SmallVectorImpl<SDValue> &Results,
27330 SelectionDAG &DAG,
27331 const AArch64Subtarget *Subtarget) {
27332 // LSE128 has a 128-bit RMW ops, but i128 is not a legal type, so lower it
27333 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
27334 // rather than the CASP instructions, because CASP has register classes for
27335 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
27336 // to present them as single operands. LSE128 instructions use the GPR64
27337 // register class (because the pair does not have to be sequential), like
27338 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
27339
27340 assert(N->getValueType(0) == MVT::i128 &&
27341 "AtomicLoadXXX on types less than 128 should be legal");
27342
27343 if (!Subtarget->hasLSE128())
27344 return;
27345
27346 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
27347 const SDValue &Chain = N->getOperand(0);
27348 const SDValue &Ptr = N->getOperand(1);
27349 const SDValue &Val128 = N->getOperand(2);
27350 std::pair<SDValue, SDValue> Val2x64 =
27351 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
27352
27353 const unsigned ISDOpcode = N->getOpcode();
27354 const unsigned MachineOpcode =
27355 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
27356
27357 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27358 SDLoc dl(Val128);
27359 Val2x64.first =
27360 DAG.getNode(ISD::XOR, dl, MVT::i64,
27361 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
27362 Val2x64.second =
27363 DAG.getNode(ISD::XOR, dl, MVT::i64,
27364 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
27365 }
27366
27367 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
27368 if (DAG.getDataLayout().isBigEndian())
27369 std::swap(Ops[0], Ops[1]);
27370
27371 MachineSDNode *AtomicInst =
27372 DAG.getMachineNode(MachineOpcode, SDLoc(N),
27373 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
27374
27375 DAG.setNodeMemRefs(AtomicInst, {MemOp});
27376
27377 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
27378 if (DAG.getDataLayout().isBigEndian())
27379 std::swap(Lo, Hi);
27380
27381 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
27382 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
27383}
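// Illustrative example (added commentary, not from the source): with LSE128,
// a sequentially consistent
//   atomicrmw and ptr %p, i128 %mask
// maps to LDCLRPAL, so the value operand is first inverted with two 64-bit
// XORs above (LDCLRP clears the bits that are set in its operand) and the two
// 64-bit results are recombined with BUILD_PAIR.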
27384
27385void AArch64TargetLowering::ReplaceNodeResults(
27386 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27387 switch (N->getOpcode()) {
27388 default:
27389 llvm_unreachable("Don't know how to custom expand this");
27390 case ISD::BITCAST:
27391 ReplaceBITCASTResults(N, Results, DAG);
27392 return;
27393 case ISD::VECREDUCE_ADD:
27394 case ISD::VECREDUCE_SMAX:
27395 case ISD::VECREDUCE_SMIN:
27396 case ISD::VECREDUCE_UMAX:
27397 case ISD::VECREDUCE_UMIN:
27398 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
27399 return;
27400 case ISD::VECTOR_COMPRESS:
27401 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
27402 Results.push_back(Res);
27403 return;
27404 case ISD::ADD:
27405 case ISD::FADD:
27406 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
27407 return;
27408
27409 case ISD::CTPOP:
27410 case ISD::PARITY:
27411 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
27412 Results.push_back(Result);
27413 return;
27414 case AArch64ISD::SADDV:
27415 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
27416 return;
27417 case AArch64ISD::UADDV:
27418 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
27419 return;
27420 case AArch64ISD::SMINV:
27421 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
27422 return;
27423 case AArch64ISD::UMINV:
27424 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
27425 return;
27426 case AArch64ISD::SMAXV:
27427 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
27428 return;
27429 case AArch64ISD::UMAXV:
27430 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
27431 return;
27432 case ISD::MULHS:
27434 Results.push_back(
27435 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
27436 return;
27437 case ISD::MULHU:
27439 Results.push_back(
27440 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
27441 return;
27442 case ISD::FP_TO_UINT:
27443 case ISD::FP_TO_SINT:
27444 case ISD::STRICT_FP_TO_SINT:
27445 case ISD::STRICT_FP_TO_UINT:
27446 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
27447 // Let normal code take care of it by not adding anything to Results.
27448 return;
27449 case ISD::ATOMIC_CMP_SWAP:
27450 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
27451 return;
27452 case ISD::ATOMIC_LOAD_CLR:
27453 assert(N->getValueType(0) != MVT::i128 &&
27454 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
27455 break;
27456 case ISD::ATOMIC_LOAD_AND:
27457 case ISD::ATOMIC_LOAD_OR:
27458 case ISD::ATOMIC_SWAP: {
27459 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
27460 "Expected 128-bit atomicrmw.");
27461 // These need custom type legalisation so we go directly to instruction.
27462 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
27463 return;
27464 }
27465 case ISD::ADDRSPACECAST: {
27466 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
27467 Results.push_back(V);
27468 return;
27469 }
27470 case ISD::ATOMIC_LOAD:
27471 case ISD::LOAD: {
27472 MemSDNode *LoadNode = cast<MemSDNode>(N);
27473 EVT MemVT = LoadNode->getMemoryVT();
27474 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
27475 // targets.
27476 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
27477 MemVT.getSizeInBits() == 256u &&
27478 (MemVT.getScalarSizeInBits() == 8u ||
27479 MemVT.getScalarSizeInBits() == 16u ||
27480 MemVT.getScalarSizeInBits() == 32u ||
27481 MemVT.getScalarSizeInBits() == 64u)) {
27482
27483 SDValue Result = DAG.getMemIntrinsicNode(
27484 AArch64ISD::LDNP, SDLoc(N),
27485 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
27486 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
27487 MVT::Other}),
27488 {LoadNode->getChain(), LoadNode->getBasePtr()},
27489 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
27490
27491 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
27492 Result.getValue(0), Result.getValue(1));
27493 Results.append({Pair, Result.getValue(2) /* Chain */});
27494 return;
27495 }
27496
27497 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
27498 LoadNode->getMemoryVT() != MVT::i128) {
27499 // Loads that are neither volatile nor atomic are optimized later in
27500 // AArch64's load/store optimizer.
27501 return;
27502 }
27503
27504 if (SDValue(N, 0).getValueType() == MVT::i128) {
27505 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
27506 bool isLoadAcquire =
27508 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
27509
27510 if (isLoadAcquire)
27511 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
27512
27513 SDValue Result = DAG.getMemIntrinsicNode(
27514 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
27515 {LoadNode->getChain(), LoadNode->getBasePtr()},
27516 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
27517
27518 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
27519
27520 SDValue Pair =
27521 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
27522 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
27523 Results.append({Pair, Result.getValue(2) /* Chain */});
27524 }
27525 return;
27526 }
27527 case ISD::EXTRACT_SUBVECTOR:
27528 ReplaceExtractSubVectorResults(N, Results, DAG);
27529 return;
27530 case ISD::INSERT_SUBVECTOR:
27531 case ISD::CONCAT_VECTORS:
27532 // Custom lowering has been requested for INSERT_SUBVECTOR and
27533 // CONCAT_VECTORS -- but delegate to common code for result type
27534 // legalisation
27535 return;
27536 case ISD::INTRINSIC_WO_CHAIN: {
27537 EVT VT = N->getValueType(0);
27538
27539 Intrinsic::ID IntID =
27540 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
27541 switch (IntID) {
27542 default:
27543 return;
27544 case Intrinsic::aarch64_sve_clasta_n: {
27545 assert((VT == MVT::i8 || VT == MVT::i16) &&
27546 "custom lowering for unexpected type");
27547 SDLoc DL(N);
27548 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
27549 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
27550 N->getOperand(1), Op2, N->getOperand(3));
27551 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27552 return;
27553 }
27554 case Intrinsic::aarch64_sve_clastb_n: {
27555 assert((VT == MVT::i8 || VT == MVT::i16) &&
27556 "custom lowering for unexpected type");
27557 SDLoc DL(N);
27558 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
27559 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
27560 N->getOperand(1), Op2, N->getOperand(3));
27561 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27562 return;
27563 }
27564 case Intrinsic::aarch64_sve_lasta: {
27565 assert((VT == MVT::i8 || VT == MVT::i16) &&
27566 "custom lowering for unexpected type");
27567 SDLoc DL(N);
27568 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
27569 N->getOperand(1), N->getOperand(2));
27570 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27571 return;
27572 }
27573 case Intrinsic::aarch64_sve_lastb: {
27574 assert((VT == MVT::i8 || VT == MVT::i16) &&
27575 "custom lowering for unexpected type");
27576 SDLoc DL(N);
27577 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
27578 N->getOperand(1), N->getOperand(2));
27579 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27580 return;
27581 }
27582 case Intrinsic::aarch64_sme_in_streaming_mode: {
27583 SDLoc DL(N);
27584 SDValue Chain = DAG.getEntryNode();
27585 SDValue RuntimePStateSM =
27586 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
27587 Results.push_back(
27588 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
27589 return;
27590 }
27591 case Intrinsic::experimental_vector_match:
27592 case Intrinsic::get_active_lane_mask: {
27593 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
27594 return;
27595
27596 // NOTE: Only trivial type promotion is supported.
27597 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
27598 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
27599 return;
27600
27601 SDLoc DL(N);
27602 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
27603 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27604 return;
27605 }
27606 }
27607 }
27608 case ISD::READ_REGISTER: {
27609 SDLoc DL(N);
27610 assert(N->getValueType(0) == MVT::i128 &&
27611 "READ_REGISTER custom lowering is only for 128-bit sysregs");
27612 SDValue Chain = N->getOperand(0);
27613 SDValue SysRegName = N->getOperand(1);
27614
27615 SDValue Result = DAG.getNode(
27616 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
27617 Chain, SysRegName);
27618
27619 // Sysregs are not endian. Result.getValue(0) always contains the lower half
27620 // of the 128-bit System Register value.
27621 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
27622 Result.getValue(0), Result.getValue(1));
27623 Results.push_back(Pair);
27624 Results.push_back(Result.getValue(2)); // Chain
27625 return;
27626 }
27627 }
27628}
27629
27630 bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
27631   if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
27632     return TargetLowering::useLoadStackGuardNode(M);
27633   return true;
27634}
27635
27636unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
27637 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
27638 // reciprocal if there are three or more FDIVs.
27639 return 3;
27640}
27641
27642 TargetLoweringBase::LegalizeTypeAction
27643 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
27644   // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
27645 // v4i16, v2i32 instead of to promote.
27646 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
27647 VT == MVT::v1f32)
27648 return TypeWidenVector;
27649
27650   return TargetLoweringBase::getPreferredVectorAction(VT);
27651 }
27652
27653// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
27654// provided the address is 16-byte aligned.
27655 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
27656   if (!Subtarget->hasLSE2())
27657 return false;
27658
27659 if (auto LI = dyn_cast<LoadInst>(I))
27660 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27661 LI->getAlign() >= Align(16);
27662
27663 if (auto SI = dyn_cast<StoreInst>(I))
27664 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27665 SI->getAlign() >= Align(16);
27666
27667 return false;
27668}
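// Illustrative example (sketch, value names invented): under LSE2 an aligned
// 16-byte atomic access such as
//   %v = load atomic i128, ptr %p monotonic, align 16
// passes this check and is expected to select to a single LDP; smaller or
// under-aligned accesses take the generic atomic expansion paths instead.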
27669
27670 bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
27671   if (!Subtarget->hasLSE128())
27672 return false;
27673
27674 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
27675 // will clobber the two registers.
27676 if (const auto *SI = dyn_cast<StoreInst>(I))
27677 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27678 SI->getAlign() >= Align(16) &&
27679 (SI->getOrdering() == AtomicOrdering::Release ||
27680 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
27681
27682 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
27683 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27684 RMW->getAlign() >= Align(16) &&
27685 (RMW->getOperation() == AtomicRMWInst::Xchg ||
27686 RMW->getOperation() == AtomicRMWInst::And ||
27687 RMW->getOperation() == AtomicRMWInst::Or);
27688
27689 return false;
27690}
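// Illustrative example (sketch): with +lse128 a 16-byte release store such as
//   store atomic i128 %v, ptr %p release, align 16
// passes this check; AtomicExpand is then expected to rewrite it as an
// atomicrmw xchg so that it can be selected to a SWPP-family instruction
// rather than an STP plus fence sequence.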
27691
27692 bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
27693   if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
27694 return false;
27695
27696 if (auto LI = dyn_cast<LoadInst>(I))
27697 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27698 LI->getAlign() >= Align(16) &&
27699 LI->getOrdering() == AtomicOrdering::Acquire;
27700
27701 if (auto SI = dyn_cast<StoreInst>(I))
27702 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27703 SI->getAlign() >= Align(16) &&
27704 SI->getOrdering() == AtomicOrdering::Release;
27705
27706 return false;
27707}
27708
27709 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
27710     const Instruction *I) const {
27711   if (isOpSuitableForRCPC3(I))
27712     return false;
27713   if (isOpSuitableForLSE128(I))
27714     return false;
27715   if (isOpSuitableForLDPSTP(I))
27716     return true;
27717 return false;
27718}
27719
27720 bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
27721     const Instruction *I) const {
27722 // Store-Release instructions only provide seq_cst guarantees when paired with
27723 // Load-Acquire instructions. MSVC CRT does not use these instructions to
27724 // implement seq_cst loads and stores, so we need additional explicit fences
27725 // after memory writes.
27726 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27727 return false;
27728
27729 switch (I->getOpcode()) {
27730 default:
27731 return false;
27732 case Instruction::AtomicCmpXchg:
27733 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
27734            AtomicOrdering::SequentiallyConsistent;
27735   case Instruction::AtomicRMW:
27736 return cast<AtomicRMWInst>(I)->getOrdering() ==
27737            AtomicOrdering::SequentiallyConsistent;
27738   case Instruction::Store:
27739 return cast<StoreInst>(I)->getOrdering() ==
27740            AtomicOrdering::SequentiallyConsistent;
27741   }
27742}
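// Illustrative effect on an MSVC environment (sketch): a seq_cst store such as
//   store atomic i64 %v, ptr %p seq_cst, align 8
// keeps its store-release lowering but is expected to be followed by an
// explicit full barrier (DMB ISH), matching the assumptions the MSVC CRT
// makes about sequentially consistent writes.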
27743
27744// Loads and stores less than 128-bits are already atomic; ones above that
27745// are doomed anyway, so defer to the default libcall and blame the OS when
27746// things go wrong.
27747 TargetLowering::AtomicExpansionKind
27748 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
27749   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
27750   if (Size != 128)
27751     return AtomicExpansionKind::None;
27752   if (isOpSuitableForRCPC3(SI))
27753     return AtomicExpansionKind::None;
27754   if (isOpSuitableForLSE128(SI))
27755     return AtomicExpansionKind::Expand;
27756   if (isOpSuitableForLDPSTP(SI))
27757     return AtomicExpansionKind::None;
27758   return AtomicExpansionKind::Expand;
27759 }
27760
27761// Loads and stores less than 128-bits are already atomic; ones above that
27762// are doomed anyway, so defer to the default libcall and blame the OS when
27763// things go wrong.
27764 TargetLowering::AtomicExpansionKind
27765 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
27766   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
27767
27768   if (Size != 128)
27769     return AtomicExpansionKind::None;
27770   if (isOpSuitableForRCPC3(LI))
27771     return AtomicExpansionKind::None;
27772   // No LSE128 loads
27773   if (isOpSuitableForLDPSTP(LI))
27774     return AtomicExpansionKind::None;
27775
27776 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27777 // implement atomicrmw without spilling. If the target address is also on the
27778 // stack and close enough to the spill slot, this can lead to a situation
27779 // where the monitor always gets cleared and the atomic operation can never
27780 // succeed. So at -O0 lower this operation to a CAS loop.
27781 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
27782     return AtomicExpansionKind::LLSC;
27783 
27784 // Using CAS for an atomic load has a better chance of succeeding under high
27785 // contention situations. So use it if available.
27786 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
27787                              : AtomicExpansionKind::LLSC;
27788 }
27789
27790// Return true if the atomic operation expansion will lower to use a library
27791// call, and is thus ineligible to use an LLSC expansion.
27792static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
27793 const AtomicRMWInst *RMW) {
27794 if (!RMW->isFloatingPointOperation())
27795 return false;
27796 switch (RMW->getType()->getScalarType()->getTypeID()) {
27797 case Type::FloatTyID:
27798 case Type::DoubleTyID:
27799 case Type::HalfTyID:
27800 case Type::BFloatTyID:
27801 // Will use soft float
27802 return !Subtarget.hasFPARMv8();
27803 default:
27804 // fp128 will emit library calls.
27805 return true;
27806 }
27807
27808 llvm_unreachable("covered type switch");
27809}
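// Illustrative example (sketch): for 'atomicrmw fadd ptr %p, fp128 %x seq_cst'
// the fp128 add itself is expected to lower to a library call, and a call
// inside an LL/SC loop could clear the exclusive monitor, so the LLSC
// expansion is avoided for it; float/double/half/bfloat only hit this path
// when FP registers are unavailable (soft float).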
27810
27811// The "default" for integer RMW operations is to expand to an LL/SC loop.
27812// However, with the LSE instructions (or outline-atomics mode, which provides
27813// library routines in place of the LSE-instructions), we can directly emit many
27814// operations instead.
27815 TargetLowering::AtomicExpansionKind
27816 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
27817   Type *Ty = AI->getType();
27818 unsigned Size = Ty->getPrimitiveSizeInBits();
27819 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
27820
27821 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
27822                        (AI->getOperation() == AtomicRMWInst::Xchg ||
27823                         AI->getOperation() == AtomicRMWInst::Or ||
27824                         AI->getOperation() == AtomicRMWInst::And);
27825   if (CanUseLSE128)
27826     return AtomicExpansionKind::None;
27827 
27828 // Nand is not supported in LSE.
27829 // Leave 128 bits to LLSC or CmpXChg.
27830 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
27831 !AI->isFloatingPointOperation()) {
27832 if (Subtarget->hasLSE())
27833       return AtomicExpansionKind::None;
27834     if (Subtarget->outlineAtomics()) {
27835       // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
27836 // Don't outline them unless
27837 // (1) high level <atomic> support approved:
27838 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
27839 // (2) low level libgcc and compiler-rt support implemented by:
27840 // min/max outline atomics helpers
27841 if (AI->getOperation() != AtomicRMWInst::Min &&
27842           AI->getOperation() != AtomicRMWInst::Max &&
27843           AI->getOperation() != AtomicRMWInst::UMin &&
27844           AI->getOperation() != AtomicRMWInst::UMax) {
27845         return AtomicExpansionKind::None;
27846       }
27847 }
27848 }
27849
27850 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27851 // implement atomicrmw without spilling. If the target address is also on the
27852 // stack and close enough to the spill slot, this can lead to a situation
27853 // where the monitor always gets cleared and the atomic operation can never
27854 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
27855 // we have a single CAS instruction that can replace the loop.
27856   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
27857       Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
27858     return AtomicExpansionKind::CmpXChg;
27859 
27860   return AtomicExpansionKind::LLSC;
27861 }
27862
27863 TargetLowering::AtomicExpansionKind
27864 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
27865     AtomicCmpXchgInst *AI) const {
27866 // If subtarget has LSE, leave cmpxchg intact for codegen.
27867 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
27868     return AtomicExpansionKind::None;
27869   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27870 // implement cmpxchg without spilling. If the address being exchanged is also
27871 // on the stack and close enough to the spill slot, this can lead to a
27872 // situation where the monitor always gets cleared and the atomic operation
27873 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
27874 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
27875     return AtomicExpansionKind::None;
27876 
27877 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
27878 // it.
27879   unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
27880   if (Size > 64)
27881     return AtomicExpansionKind::None;
27882 
27883   return AtomicExpansionKind::LLSC;
27884 }
27885
27886 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
27887                                              Type *ValueTy, Value *Addr,
27888 AtomicOrdering Ord) const {
27889 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27890 bool IsAcquire = isAcquireOrStronger(Ord);
27891
27892 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
27893 // intrinsic must return {i64, i64} and we have to recombine them into a
27894 // single i128 here.
27895 if (ValueTy->getPrimitiveSizeInBits() == 128) {
27896     Intrinsic::ID Int =
27897         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
27898
27899 Value *LoHi =
27900 Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi");
27901
27902 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
27903 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
27904
27905 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
27906 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
27907 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
27908
27909 Value *Or = Builder.CreateOr(
27910 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
27911 return Builder.CreateBitCast(Or, ValueTy);
27912 }
27913
27914 Type *Tys[] = { Addr->getType() };
27915   Intrinsic::ID Int =
27916       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
27917
27918 const DataLayout &DL = M->getDataLayout();
27919 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
27920 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
27921 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
27922 Attribute::ElementType, IntEltTy));
27923 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
27924
27925 return Builder.CreateBitCast(Trunc, ValueTy);
27926}
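// Illustrative IR for the 128-bit path above (value names are invented):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo = extractvalue { i64, i64 } %lohi, 0
//   %hi = extractvalue { i64, i64 } %lohi, 1
// followed by zexts to i128, a shift of %hi left by 64 and an or that forms
// the final i128 value.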
27927
27928 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
27929     IRBuilderBase &Builder) const {
27930 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {});
27931}
27932
27933 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
27934                                                    Value *Val, Value *Addr,
27935 AtomicOrdering Ord) const {
27936 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27937 bool IsRelease = isReleaseOrStronger(Ord);
27938
27939 // Since the intrinsics must have legal type, the i128 intrinsics take two
27940 // parameters: "i64, i64". We must marshal Val into the appropriate form
27941 // before the call.
27942 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
27943     Intrinsic::ID Int =
27944         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
27945     Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int);
27946 Type *Int64Ty = Type::getInt64Ty(M->getContext());
27947 Type *Int128Ty = Type::getInt128Ty(M->getContext());
27948
27949 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
27950
27951 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
27952 Value *Hi =
27953 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
27954 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
27955 }
27956
27957   Intrinsic::ID Int =
27958       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
27959 Type *Tys[] = { Addr->getType() };
27960   Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
27961 
27962 const DataLayout &DL = M->getDataLayout();
27963 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
27964 Val = Builder.CreateBitCast(Val, IntValTy);
27965
27966 CallInst *CI = Builder.CreateCall(
27967 Stxr, {Builder.CreateZExtOrBitCast(
27968 Val, Stxr->getFunctionType()->getParamType(0)),
27969 Addr});
27970 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
27971 Attribute::ElementType, Val->getType()));
27972 return CI;
27973}
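// Illustrative IR for the 128-bit path above (value names are invented):
//   %lo = trunc i128 %val to i64
//   %shr = lshr i128 %val, 64
//   %hi = trunc i128 %shr to i64
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// where a non-zero %status means the exclusive store failed and the
// surrounding LL/SC loop retries.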
27974
27975 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
27976     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
27977 const DataLayout &DL) const {
27978 if (!Ty->isArrayTy()) {
27979 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
27980 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
27981 }
27982
27983 // All non aggregate members of the type must have the same type
27984 SmallVector<EVT> ValueVTs;
27985 ComputeValueVTs(*this, DL, Ty, ValueVTs);
27986 return all_equal(ValueVTs);
27987}
27988
27989bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
27990 EVT) const {
27991 return false;
27992}
27993
27994static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
27995 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
27996 Function *ThreadPointerFunc =
27997 Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer);
27998 return IRB.CreatePointerCast(
27999 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
28000 Offset),
28001 IRB.getPtrTy(0));
28002}
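// A minimal sketch of the IR this helper builds (names are invented, and the
// exact thread-pointer intrinsic overload may differ):
//   %tp = call ptr @llvm.thread.pointer()
//   %slot = getelementptr i8, ptr %tp, i32 <Offset>
// followed by a pointer cast to an address-space-0 pointer.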
28003
28004 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
28005   // Android provides a fixed TLS slot for the stack cookie. See the definition
28006 // of TLS_SLOT_STACK_GUARD in
28007 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
28008 if (Subtarget->isTargetAndroid())
28009 return UseTlsOffset(IRB, 0x28);
28010
28011 // Fuchsia is similar.
28012 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
28013 if (Subtarget->isTargetFuchsia())
28014 return UseTlsOffset(IRB, -0x10);
28015
28016   return TargetLowering::getIRStackGuard(IRB);
28017 }
28018
28019 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
28020   // MSVC CRT provides functionalities for stack protection.
28021 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
28022 // MSVC CRT has a global variable holding security cookie.
28023 M.getOrInsertGlobal("__security_cookie",
28024 PointerType::getUnqual(M.getContext()));
28025
28026 // MSVC CRT has a function to validate security cookie.
28027 FunctionCallee SecurityCheckCookie =
28028 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
28029 Type::getVoidTy(M.getContext()),
28030 PointerType::getUnqual(M.getContext()));
28031 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
28032 F->setCallingConv(CallingConv::Win64);
28033 F->addParamAttr(0, Attribute::AttrKind::InReg);
28034 }
28035 return;
28036 }
28037   TargetLowering::insertSSPDeclarations(M);
28038 }
28039
28040 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
28041   // MSVC CRT has a global variable holding security cookie.
28042 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28043 return M.getGlobalVariable("__security_cookie");
28044   return TargetLowering::getSDagStackGuard(M);
28045 }
28046
28047 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
28048   // MSVC CRT has a function to validate security cookie.
28049 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28050 return M.getFunction(Subtarget->getSecurityCheckCookieName());
28051   return TargetLowering::getSSPStackGuardCheck(M);
28052 }
28053
28054Value *
28055 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
28056   // Android provides a fixed TLS slot for the SafeStack pointer. See the
28057 // definition of TLS_SLOT_SAFESTACK in
28058 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
28059 if (Subtarget->isTargetAndroid())
28060 return UseTlsOffset(IRB, 0x48);
28061
28062 // Fuchsia is similar.
28063 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
28064 if (Subtarget->isTargetFuchsia())
28065 return UseTlsOffset(IRB, -0x8);
28066
28067   return TargetLowering::getSafeStackPointerLocation(IRB);
28068 }
28069
28070/// If a physical register, this returns the register that receives the
28071/// exception address on entry to an EH pad.
28072 Register AArch64TargetLowering::getExceptionPointerRegister(
28073     const Constant *PersonalityFn) const {
28074 // FIXME: This is a guess. Has this been defined yet?
28075 return AArch64::X0;
28076}
28077
28078/// If a physical register, this returns the register that receives the
28079/// exception typeid on entry to a landing pad.
28080 Register AArch64TargetLowering::getExceptionSelectorRegister(
28081     const Constant *PersonalityFn) const {
28082 // FIXME: This is a guess. Has this been defined yet?
28083 return AArch64::X1;
28084}
28085
28086 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
28087     const Instruction &AndI) const {
28088 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
28089   // this is likely to fold the and/cmp/br into a single tbz instruction. It
28090 // may be beneficial to sink in other cases, but we would have to check that
28091 // the cmp would not get folded into the br to form a cbz for these to be
28092 // beneficial.
28093 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
28094 if (!Mask)
28095 return false;
28096 return Mask->getValue().isPowerOf2();
28097}
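// Example of the pattern this enables (sketch, names invented): for
//   %m = and i64 %x, 4
//   %c = icmp eq i64 %m, 0
//   br i1 %c, label %a, label %b
// sinking the 'and' next to the compare lets instruction selection form a
// single TBZ/TBNZ on bit 2 instead of an AND followed by CBZ.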
28098
28099 bool AArch64TargetLowering::
28100     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28101         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
28102         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
28103 SelectionDAG &DAG) const {
28104 // Does baseline recommend not to perform the fold by default?
28105   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28106           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
28107 return false;
28108 // Else, if this is a vector shift, prefer 'shl'.
28109 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
28110}
28111
28112 TargetLowering::ShiftLegalizationStrategy
28113 AArch64TargetLowering::preferredShiftLegalizationStrategy(
28114     SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
28115   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
28116       !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
28117     return ShiftLegalizationStrategy::LowerToLibcall;
28118   return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
28119                                                             ExpansionFactor);
28120}
28121
28122 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
28123   // Update IsSplitCSR in AArch64FunctionInfo.
28124 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
28125 AFI->setIsSplitCSR(true);
28126}
28127
28128 void AArch64TargetLowering::insertCopiesSplitCSR(
28129     MachineBasicBlock *Entry,
28130 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
28131 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
28132 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
28133 if (!IStart)
28134 return;
28135
28136 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28137 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28138 MachineBasicBlock::iterator MBBI = Entry->begin();
28139 for (const MCPhysReg *I = IStart; *I; ++I) {
28140 const TargetRegisterClass *RC = nullptr;
28141 if (AArch64::GPR64RegClass.contains(*I))
28142 RC = &AArch64::GPR64RegClass;
28143 else if (AArch64::FPR64RegClass.contains(*I))
28144 RC = &AArch64::FPR64RegClass;
28145 else
28146 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
28147
28148 Register NewVR = MRI->createVirtualRegister(RC);
28149 // Create copy from CSR to a virtual register.
28150 // FIXME: this currently does not emit CFI pseudo-instructions, it works
28151 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
28152 // nounwind. If we want to generalize this later, we may need to emit
28153 // CFI pseudo-instructions.
28154 assert(Entry->getParent()->getFunction().hasFnAttribute(
28155 Attribute::NoUnwind) &&
28156 "Function should be nounwind in insertCopiesSplitCSR!");
28157 Entry->addLiveIn(*I);
28158 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
28159 .addReg(*I);
28160
28161 // Insert the copy-back instructions right before the terminator.
28162 for (auto *Exit : Exits)
28163 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
28164 TII->get(TargetOpcode::COPY), *I)
28165 .addReg(NewVR);
28166 }
28167}
28168
28169 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
28170   // Integer division on AArch64 is expensive. However, when aggressively
28171 // optimizing for code size, we prefer to use a div instruction, as it is
28172 // usually smaller than the alternative sequence.
28173 // The exception to this is vector division. Since AArch64 doesn't have vector
28174 // integer division, leaving the division as-is is a loss even in terms of
28175 // size, because it will have to be scalarized, while the alternative code
28176 // sequence can be performed in vector form.
28177 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
28178 return OptSize && !VT.isVector();
28179}
28180
28181 bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
28182                                              const MachineFunction &MF) const {
28183 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
28184 // In future, we could allow this when SVE is available, but currently,
28185 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
28186 // the general lowering may introduce stack spills/reloads).
28187 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
28188 return false;
28189
28190   // Do not merge to float value size (128 bits) if no implicit float attribute
28191 // is set.
28192 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
28193 return !NoFloat || MemVT.getSizeInBits() <= 64;
28194}
28195
28196 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
28197   // We want inc-of-add for scalars and sub-of-not for vectors.
28198 return VT.isScalarInteger();
28199}
28200
28201 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
28202                                                  EVT VT) const {
28203 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
28204 // legalize.
28205 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
28206 return false;
28207 if (FPVT == MVT::v8bf16)
28208 return false;
28209 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
28210}
28211
28212 bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
28213   // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
28214 // avoid vselect becoming bsl / unrolling.
28215 return !VT.isFixedLengthVector();
28216}
28217
28218 MachineInstr *
28219 AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
28220                                      MachineBasicBlock::instr_iterator &MBBI,
28221                                      const TargetInstrInfo *TII) const {
28222 assert(MBBI->isCall() && MBBI->getCFIType() &&
28223 "Invalid call instruction for a KCFI check");
28224
28225 switch (MBBI->getOpcode()) {
28226 case AArch64::BLR:
28227 case AArch64::BLRNoIP:
28228 case AArch64::TCRETURNri:
28229 case AArch64::TCRETURNrix16x17:
28230 case AArch64::TCRETURNrix17:
28231 case AArch64::TCRETURNrinotx16:
28232 break;
28233 default:
28234 llvm_unreachable("Unexpected CFI call opcode");
28235 }
28236
28237 MachineOperand &Target = MBBI->getOperand(0);
28238 assert(Target.isReg() && "Invalid target operand for an indirect call");
28239 Target.setIsRenamable(false);
28240
28241 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
28242 .addReg(Target.getReg())
28243 .addImm(MBBI->getCFIType())
28244 .getInstr();
28245}
28246
28247 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
28248   return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
28249}
28250
28251unsigned
28252 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
28253   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
28254 return getPointerTy(DL).getSizeInBits();
28255
28256 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
28257}
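// For example: with the AAPCS64 va_list this is 3 pointers plus 2 ints,
// i.e. 3 * 64 + 2 * 32 = 256 bits (a 32-byte struct), whereas Darwin and
// Windows use a single pointer-sized va_list.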
28258
28259void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
28260 MachineFrameInfo &MFI = MF.getFrameInfo();
28261 // If we have any vulnerable SVE stack objects then the stack protector
28262 // needs to be placed at the top of the SVE stack area, as the SVE locals
28263 // are placed above the other locals, so we allocate it as if it were a
28264 // scalable vector.
28265 // FIXME: It may be worthwhile having a specific interface for this rather
28266 // than doing it here in finalizeLowering.
28267 if (MFI.hasStackProtectorIndex()) {
28268 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
28269       if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
28270           MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
28271         MFI.setStackID(MFI.getStackProtectorIndex(),
28272                        TargetStackID::ScalableVector);
28273         MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
28274         break;
28275 }
28276 }
28277 }
28278 
28279   TargetLoweringBase::finalizeLowering(MF);
28280 }
28281
28282// Unlike X86, we let frame lowering assign offsets to all catch objects.
28283 bool AArch64TargetLowering::needsFixedCatchObjects() const {
28284   return false;
28285}
28286
28287bool AArch64TargetLowering::shouldLocalize(
28288 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
28289 auto &MF = *MI.getMF();
28290 auto &MRI = MF.getRegInfo();
28291 auto maxUses = [](unsigned RematCost) {
28292 // A cost of 1 means remats are basically free.
28293 if (RematCost == 1)
28294 return std::numeric_limits<unsigned>::max();
28295 if (RematCost == 2)
28296 return 2U;
28297
28298 // Remat is too expensive, only sink if there's one user.
28299 if (RematCost > 2)
28300 return 1U;
28301 llvm_unreachable("Unexpected remat cost");
28302 };
28303
28304 unsigned Opc = MI.getOpcode();
28305 switch (Opc) {
28306 case TargetOpcode::G_GLOBAL_VALUE: {
28307 // On Darwin, TLS global vars get selected into function calls, which
28308     // we don't want localized, as they can get moved into the middle of
28309 // another call sequence.
28310 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
28311 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
28312 return false;
28313 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
28314 }
28315 case TargetOpcode::G_FCONSTANT:
28316 case TargetOpcode::G_CONSTANT: {
28317 const ConstantInt *CI;
28318 unsigned AdditionalCost = 0;
28319
28320 if (Opc == TargetOpcode::G_CONSTANT)
28321 CI = MI.getOperand(1).getCImm();
28322 else {
28323 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
28324 // We try to estimate cost of 32/64b fpimms, as they'll likely be
28325 // materialized as integers.
28326 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
28327 break;
28328 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
28329 bool OptForSize =
28330         MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
28331     if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
28332                      OptForSize))
28333 return true; // Constant should be cheap.
28334 CI =
28335 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
28336 // FP materialization also costs an extra move, from gpr to fpr.
28337 AdditionalCost = 1;
28338 }
28339 APInt Imm = CI->getValue();
28340     InstructionCost Cost = TTI->getIntImmCost(
28341         Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
28342     assert(Cost.isValid() && "Expected a valid imm cost");
28343
28344 unsigned RematCost = *Cost.getValue();
28345 RematCost += AdditionalCost;
28346 Register Reg = MI.getOperand(0).getReg();
28347 unsigned MaxUses = maxUses(RematCost);
28348 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
28349 if (MaxUses == std::numeric_limits<unsigned>::max())
28350 --MaxUses;
28351 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
28352 }
28353 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
28354 // localizable.
28355 case AArch64::ADRP:
28356 case AArch64::G_ADD_LOW:
28357 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
28358 case TargetOpcode::G_PTR_ADD:
28359 return true;
28360 default:
28361 break;
28362 }
28363   return TargetLoweringBase::shouldLocalize(MI, TTI);
28364 }
28365
28366 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
28367   // Fallback for scalable vectors.
28368 // Note that if EnableSVEGISel is true, we allow scalable vector types for
28369 // all instructions, regardless of whether they are actually supported.
28370 if (!EnableSVEGISel) {
28371 if (Inst.getType()->isScalableTy()) {
28372 return true;
28373 }
28374
28375 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
28376 if (Inst.getOperand(i)->getType()->isScalableTy())
28377 return true;
28378
28379 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
28380 if (AI->getAllocatedType()->isScalableTy())
28381 return true;
28382 }
28383 }
28384
28385 // Checks to allow the use of SME instructions
28386 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
28387 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
28388 auto CalleeAttrs = SMEAttrs(*Base);
28389 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
28390 CallerAttrs.requiresLazySave(CalleeAttrs) ||
28391 CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
28392 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs))
28393 return true;
28394 }
28395 return false;
28396}
28397
28398// Return the largest legal scalable vector type that matches VT's element type.
28399 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
28400   assert(VT.isFixedLengthVector() &&
28401          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28402          "Expected legal fixed length vector!");
28403 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28404 default:
28405 llvm_unreachable("unexpected element type for SVE container");
28406 case MVT::i8:
28407 return EVT(MVT::nxv16i8);
28408 case MVT::i16:
28409 return EVT(MVT::nxv8i16);
28410 case MVT::i32:
28411 return EVT(MVT::nxv4i32);
28412 case MVT::i64:
28413 return EVT(MVT::nxv2i64);
28414 case MVT::bf16:
28415 return EVT(MVT::nxv8bf16);
28416 case MVT::f16:
28417 return EVT(MVT::nxv8f16);
28418 case MVT::f32:
28419 return EVT(MVT::nxv4f32);
28420 case MVT::f64:
28421 return EVT(MVT::nxv2f64);
28422 }
28423}
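// For example (sketch): v16i8 maps to nxv16i8, v8f16 to nxv8f16 and v2i64 to
// nxv2i64; only the element type matters here, the original fixed element
// count is represented separately by the predicate computed below.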
28424
28425// Return a PTRUE with active lanes corresponding to the extent of VT.
28426 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
28427                                                 EVT VT) {
28428   assert(VT.isFixedLengthVector() &&
28429          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28430          "Expected legal fixed length vector!");
28431
28432 std::optional<unsigned> PgPattern =
28433       getSVEPredPatternFromNumElements(VT.getVectorNumElements());
28434   assert(PgPattern && "Unexpected element count for SVE predicate");
28435
28436 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
28437 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
28438 // variants of instructions when available.
28439 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
28440 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
28441 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
28442 if (MaxSVESize && MinSVESize == MaxSVESize &&
28443 MaxSVESize == VT.getSizeInBits())
28444 PgPattern = AArch64SVEPredPattern::all;
28445
28446 MVT MaskVT;
28447 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28448 default:
28449 llvm_unreachable("unexpected element type for SVE predicate");
28450 case MVT::i8:
28451 MaskVT = MVT::nxv16i1;
28452 break;
28453 case MVT::i16:
28454 case MVT::f16:
28455 case MVT::bf16:
28456 MaskVT = MVT::nxv8i1;
28457 break;
28458 case MVT::i32:
28459 case MVT::f32:
28460 MaskVT = MVT::nxv4i1;
28461 break;
28462 case MVT::i64:
28463 case MVT::f64:
28464 MaskVT = MVT::nxv2i1;
28465 break;
28466 }
28467
28468 return getPTrue(DAG, DL, MaskVT, *PgPattern);
28469}
28470
28471 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
28472                                              EVT VT) {
28473   assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28474          "Expected legal scalable vector!");
28475 auto PredTy = VT.changeVectorElementType(MVT::i1);
28476 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
28477}
28478
28479 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
28480   if (VT.isFixedLengthVector())
28481 return getPredicateForFixedLengthVector(DAG, DL, VT);
28482
28483 return getPredicateForScalableVector(DAG, DL, VT);
28484}
28485
28486// Grow V to consume an entire SVE register.
28487 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28488   assert(VT.isScalableVector() &&
28489 "Expected to convert into a scalable vector!");
28490 assert(V.getValueType().isFixedLengthVector() &&
28491 "Expected a fixed length vector operand!");
28492 SDLoc DL(V);
28493 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28494 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
28495}
28496
28497// Shrink V so it's just big enough to maintain a VT's worth of data.
28498 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28499   assert(VT.isFixedLengthVector() &&
28500          "Expected to convert into a fixed length vector!");
28501 assert(V.getValueType().isScalableVector() &&
28502 "Expected a scalable vector operand!");
28503 SDLoc DL(V);
28504 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28505 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
28506}
28507
28508// Convert all fixed length vector loads larger than NEON to masked_loads.
28509SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
28510 SDValue Op, SelectionDAG &DAG) const {
28511 auto Load = cast<LoadSDNode>(Op);
28512
28513 SDLoc DL(Op);
28514 EVT VT = Op.getValueType();
28515 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28516 EVT LoadVT = ContainerVT;
28517 EVT MemVT = Load->getMemoryVT();
28518
28519 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28520
28521 if (VT.isFloatingPoint()) {
28522 LoadVT = ContainerVT.changeTypeToInteger();
28523 MemVT = MemVT.changeTypeToInteger();
28524 }
28525
28526 SDValue NewLoad = DAG.getMaskedLoad(
28527 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
28528 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
28529 Load->getAddressingMode(), Load->getExtensionType());
28530
28531 SDValue Result = NewLoad;
28532 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
28533 EVT ExtendVT = ContainerVT.changeVectorElementType(
28534 Load->getMemoryVT().getVectorElementType());
28535
28536 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
28537     Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ExtendVT,
28538                          Pg, Result, DAG.getUNDEF(ContainerVT));
28539 } else if (VT.isFloatingPoint()) {
28540 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
28541 }
28542
28543 Result = convertFromScalableVector(DAG, VT, Result);
28544 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
28545 return DAG.getMergeValues(MergedValues, DL);
28546}
28547
28548 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
28549                                                 SelectionDAG &DAG) {
28550 SDLoc DL(Mask);
28551 EVT InVT = Mask.getValueType();
28552 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28553
28554 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
28555
28556 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28557 return Pg;
28558
28559 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
28560 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
28561
28562   return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
28563                      {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
28564}
28565
28566// Convert all fixed length vector loads larger than NEON to masked_loads.
28567SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
28568 SDValue Op, SelectionDAG &DAG) const {
28569 auto Load = cast<MaskedLoadSDNode>(Op);
28570
28571 SDLoc DL(Op);
28572 EVT VT = Op.getValueType();
28573 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28574
28575 SDValue Mask = Load->getMask();
28576 // If this is an extending load and the mask type is not the same as
28577 // load's type then we have to extend the mask type.
28578 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
28579 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
28580 "Incorrect mask type");
28581 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
28582 }
28583   Mask = convertFixedMaskToScalableVector(Mask, DAG);
28584 
28585 SDValue PassThru;
28586 bool IsPassThruZeroOrUndef = false;
28587
28588 if (Load->getPassThru()->isUndef()) {
28589 PassThru = DAG.getUNDEF(ContainerVT);
28590 IsPassThruZeroOrUndef = true;
28591 } else {
28592 if (ContainerVT.isInteger())
28593 PassThru = DAG.getConstant(0, DL, ContainerVT);
28594 else
28595 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
28596 if (isZerosVector(Load->getPassThru().getNode()))
28597 IsPassThruZeroOrUndef = true;
28598 }
28599
28600 SDValue NewLoad = DAG.getMaskedLoad(
28601 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
28602 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
28603 Load->getAddressingMode(), Load->getExtensionType());
28604
28605 SDValue Result = NewLoad;
28606 if (!IsPassThruZeroOrUndef) {
28607 SDValue OldPassThru =
28608 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
28609 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
28610 }
28611
28612 Result = convertFromScalableVector(DAG, VT, Result);
28613 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
28614 return DAG.getMergeValues(MergedValues, DL);
28615}
28616
28617// Convert all fixed length vector stores larger than NEON to masked_stores.
28618SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
28619 SDValue Op, SelectionDAG &DAG) const {
28620 auto Store = cast<StoreSDNode>(Op);
28621
28622 SDLoc DL(Op);
28623 EVT VT = Store->getValue().getValueType();
28624 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28625 EVT MemVT = Store->getMemoryVT();
28626
28627 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28628 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
28629
28630 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
28631 EVT TruncVT = ContainerVT.changeVectorElementType(
28632 Store->getMemoryVT().getVectorElementType());
28633 MemVT = MemVT.changeTypeToInteger();
28634 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
28635 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
28636 DAG.getUNDEF(TruncVT));
28637 NewValue =
28638 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
28639 } else if (VT.isFloatingPoint()) {
28640 MemVT = MemVT.changeTypeToInteger();
28641 NewValue =
28642 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
28643 }
28644
28645 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
28646 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
28647 Store->getMemOperand(), Store->getAddressingMode(),
28648 Store->isTruncatingStore());
28649}
28650
28651SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
28652 SDValue Op, SelectionDAG &DAG) const {
28653 auto *Store = cast<MaskedStoreSDNode>(Op);
28654
28655 SDLoc DL(Op);
28656 EVT VT = Store->getValue().getValueType();
28657 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28658
28659 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
28660   SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
28661 
28662 return DAG.getMaskedStore(
28663 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
28664 Mask, Store->getMemoryVT(), Store->getMemOperand(),
28665 Store->getAddressingMode(), Store->isTruncatingStore());
28666}
28667
28668SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
28669 SDValue Op, SelectionDAG &DAG) const {
28670 SDLoc dl(Op);
28671 EVT VT = Op.getValueType();
28672 EVT EltVT = VT.getVectorElementType();
28673
28674 bool Signed = Op.getOpcode() == ISD::SDIV;
28675 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
28676
28677 bool Negated;
28678 uint64_t SplatVal;
28679 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
28680 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28681 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
28682 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
28683
28684 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
28685 SDValue Res =
28686 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
28687 if (Negated)
28688 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
28689 DAG.getConstant(0, dl, ContainerVT), Res);
28690
28691 return convertFromScalableVector(DAG, VT, Res);
28692 }
28693
28694 // Scalable vector i32/i64 DIV is supported.
28695 if (EltVT == MVT::i32 || EltVT == MVT::i64)
28696 return LowerToPredicatedOp(Op, DAG, PredOpcode);
28697
28698 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
28699 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
28700 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
28701 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28702
28703 // If the wider type is legal: extend, op, and truncate.
28704 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
28705 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
28706 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
28707 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
28708 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
28709 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
28710 }
28711
28712 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
28713 &ExtendOpcode](SDValue Op) {
28714 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
28715 SDValue IdxHalf =
28716 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
28717 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
28718 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
28719 return std::pair<SDValue, SDValue>(
28720 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
28721 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
28722 };
28723
28724 // If wider type is not legal: split, extend, op, trunc and concat.
28725 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
28726 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
28727 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
28728 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
28729 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
28730 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
28731 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
28732}
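// Illustrative walk-through of the final path above (sketch, e.g. for v16i8):
// the operands are split into two v8i8 halves, each half is extended to
// v8i16, and the two narrower divides are emitted at v8i16; those are in turn
// lowered by re-entering this function until an i32/i64 element divide can
// use the predicated SDIV/UDIV directly, after which the halves are truncated
// and concatenated back into the original type.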
28733
28734SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
28735 SDValue Op, SelectionDAG &DAG) const {
28736 EVT VT = Op.getValueType();
28737 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28738
28739 SDLoc DL(Op);
28740 SDValue Val = Op.getOperand(0);
28741 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
28742 Val = convertToScalableVector(DAG, ContainerVT, Val);
28743
28744 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
28745 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
28746
28747 // Repeatedly unpack Val until the result is of the desired element type.
28748 switch (ContainerVT.getSimpleVT().SimpleTy) {
28749 default:
28750 llvm_unreachable("unimplemented container type");
28751 case MVT::nxv16i8:
28752 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
28753 if (VT.getVectorElementType() == MVT::i16)
28754 break;
28755 [[fallthrough]];
28756 case MVT::nxv8i16:
28757 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
28758 if (VT.getVectorElementType() == MVT::i32)
28759 break;
28760 [[fallthrough]];
28761 case MVT::nxv4i32:
28762 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
28763 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
28764 break;
28765 }
28766
28767 return convertFromScalableVector(DAG, VT, Val);
28768}
28769
28770SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
28771 SDValue Op, SelectionDAG &DAG) const {
28772 EVT VT = Op.getValueType();
28773 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28774
28775 SDLoc DL(Op);
28776 SDValue Val = Op.getOperand(0);
28777 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
28778 Val = convertToScalableVector(DAG, ContainerVT, Val);
28779
28780 // Repeatedly truncate Val until the result is of the desired element type.
28781 switch (ContainerVT.getSimpleVT().SimpleTy) {
28782 default:
28783 llvm_unreachable("unimplemented container type");
28784 case MVT::nxv2i64:
28785 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
28786 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
28787 if (VT.getVectorElementType() == MVT::i32)
28788 break;
28789 [[fallthrough]];
28790 case MVT::nxv4i32:
28791 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
28792 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
28793 if (VT.getVectorElementType() == MVT::i16)
28794 break;
28795 [[fallthrough]];
28796 case MVT::nxv8i16:
28797 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
28798 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
28799 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
28800 break;
28801 }
28802
28803 return convertFromScalableVector(DAG, VT, Val);
28804}
28805
28806SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
28807 SDValue Op, SelectionDAG &DAG) const {
28808 EVT VT = Op.getValueType();
28809 EVT InVT = Op.getOperand(0).getValueType();
28810 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
28811
28812 SDLoc DL(Op);
28813 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28814 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
28815
28816 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
28817}
28818
28819SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
28820 SDValue Op, SelectionDAG &DAG) const {
28821 EVT VT = Op.getValueType();
28822 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28823
28824 SDLoc DL(Op);
28825 EVT InVT = Op.getOperand(0).getValueType();
28826 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28827 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
28828
28829 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
28830 Op.getOperand(1), Op.getOperand(2));
28831
28832 return convertFromScalableVector(DAG, VT, ScalableRes);
28833}
28834
28835// Convert vector operation 'Op' to an equivalent predicated operation whereby
28836// the original operation's type is used to construct a suitable predicate.
28837// NOTE: The results for inactive lanes are undefined.
28838SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
28839 SelectionDAG &DAG,
28840 unsigned NewOp) const {
28841 EVT VT = Op.getValueType();
28842 SDLoc DL(Op);
28843 auto Pg = getPredicateForVector(DAG, DL, VT);
28844
28845 if (VT.isFixedLengthVector()) {
28846 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
28847 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28848
28849 // Create list of operands by converting existing ones to scalable types.
28850     SmallVector<SDValue, 4> Operands = {Pg};
28851     for (const SDValue &V : Op->op_values()) {
28852 if (isa<CondCodeSDNode>(V)) {
28853 Operands.push_back(V);
28854 continue;
28855 }
28856
28857 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
28858 EVT VTArg = VTNode->getVT().getVectorElementType();
28859 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
28860 Operands.push_back(DAG.getValueType(NewVTArg));
28861 continue;
28862 }
28863
28864 assert(isTypeLegal(V.getValueType()) &&
28865 "Expected only legal fixed-width types");
28866 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
28867 }
28868
28869 if (isMergePassthruOpcode(NewOp))
28870 Operands.push_back(DAG.getUNDEF(ContainerVT));
28871
28872 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
28873 return convertFromScalableVector(DAG, VT, ScalableRes);
28874 }
28875
28876 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
28877
28879 for (const SDValue &V : Op->op_values()) {
28880 assert((!V.getValueType().isVector() ||
28881 V.getValueType().isScalableVector()) &&
28882 "Only scalable vectors are supported!");
28883 Operands.push_back(V);
28884 }
28885
28886 if (isMergePassthruOpcode(NewOp))
28887 Operands.push_back(DAG.getUNDEF(VT));
28888
28889 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
28890}
28891
28892// If a fixed length vector operation has no side effects when applied to
28893// undefined elements, we can safely use scalable vectors to perform the same
28894// operation without needing to worry about predication.
28895SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
28896 SelectionDAG &DAG) const {
28897 EVT VT = Op.getValueType();
28898   assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
28899          "Only expected to lower fixed length vector operation!");
28900 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28901
28902 // Create list of operands by converting existing ones to scalable types.
28903   SmallVector<SDValue, 4> Ops;
28904   for (const SDValue &V : Op->op_values()) {
28905 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
28906
28907 // Pass through non-vector operands.
28908 if (!V.getValueType().isVector()) {
28909 Ops.push_back(V);
28910 continue;
28911 }
28912
28913 // "cast" fixed length vector to a scalable vector.
28914 assert(V.getValueType().isFixedLengthVector() &&
28915 isTypeLegal(V.getValueType()) &&
28916 "Only fixed length vectors are supported!");
28917 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
28918 }
28919
28920 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
28921 return convertFromScalableVector(DAG, VT, ScalableRes);
28922}
28923
28924SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
28925 SelectionDAG &DAG) const {
28926 SDLoc DL(ScalarOp);
28927 SDValue AccOp = ScalarOp.getOperand(0);
28928 SDValue VecOp = ScalarOp.getOperand(1);
28929 EVT SrcVT = VecOp.getValueType();
28930 EVT ResVT = SrcVT.getVectorElementType();
28931
28932 EVT ContainerVT = SrcVT;
28933 if (SrcVT.isFixedLengthVector()) {
28934 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
28935 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
28936 }
28937
28938 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
28939 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28940
28941 // Convert operands to Scalable.
28942 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
28943 DAG.getUNDEF(ContainerVT), AccOp, Zero);
28944
28945 // Perform reduction.
28946 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
28947 Pg, AccOp, VecOp);
28948
28949 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
28950}
28951
28952SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
28953 SelectionDAG &DAG) const {
28954 SDLoc DL(ReduceOp);
28955 SDValue Op = ReduceOp.getOperand(0);
28956 EVT OpVT = Op.getValueType();
28957 EVT VT = ReduceOp.getValueType();
28958
28959 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
28960 return SDValue();
28961
28962 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
28963
28964 switch (ReduceOp.getOpcode()) {
28965 default:
28966 return SDValue();
28967 case ISD::VECREDUCE_OR:
28968 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
28969 // The predicate can be 'Op' because
28970 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
28971 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
28972 else
28973 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
28974 case ISD::VECREDUCE_AND: {
28975 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
28976 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
28977 }
28978 case ISD::VECREDUCE_XOR: {
28979 SDValue ID =
28980 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
28981 if (OpVT == MVT::nxv1i1) {
28982 // Emulate a CNTP on .Q using .D and a different governing predicate.
28983 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
28984 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
28985 }
28986 SDValue Cntp =
28987 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
28988 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
28989 }
28990 }
28991
28992 return SDValue();
28993}
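// For example (sketch): a VECREDUCE_OR of an nxv16i1 predicate becomes a
// PTEST with the ANY_ACTIVE condition, while a VECREDUCE_XOR becomes a CNTP
// whose low bit (after truncation to the result type) gives the parity of
// the active lanes.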
28994
28995SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
28996 SDValue ScalarOp,
28997 SelectionDAG &DAG) const {
28998 SDLoc DL(ScalarOp);
28999 SDValue VecOp = ScalarOp.getOperand(0);
29000 EVT SrcVT = VecOp.getValueType();
29001
29002   if (useSVEForFixedLengthVectorVT(
29003           SrcVT,
29004 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
29005 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
29006 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
29007 }
29008
29009 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
29010 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
29011 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
29012 SDValue BoolVec = VecOp.getOperand(0);
29013 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
29014 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
29015 SDValue CntpOp = DAG.getNode(
29016 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
29017 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
29018 BoolVec, BoolVec);
29019 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
29020 }
29021 }
29022
29023 // UADDV always returns an i64 result.
29024 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
29025 SrcVT.getVectorElementType();
29026 EVT RdxVT = SrcVT;
29027 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
29028 RdxVT = getPackedSVEVectorVT(ResVT);
29029
29030 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
29031 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
29032 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
29033 Rdx, DAG.getConstant(0, DL, MVT::i64));
29034
29035 // The VEC_REDUCE nodes expect an element size result.
29036 if (ResVT != ScalarOp.getValueType())
29037 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
29038
29039 return Res;
29040}
29041
29042SDValue
29043AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
29044 SelectionDAG &DAG) const {
29045 EVT VT = Op.getValueType();
29046 SDLoc DL(Op);
29047
29048 EVT InVT = Op.getOperand(1).getValueType();
29049 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29050 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
29051 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
29052
29053   // Convert the mask to a predicate (NOTE: We don't need to worry about
29054 // inactive lanes since VSELECT is safe when given undefined elements).
29055 EVT MaskVT = Op.getOperand(0).getValueType();
29056 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
29057 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
29058   Mask = DAG.getNode(ISD::TRUNCATE, DL,
29059                      MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
29060
29061 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
29062 Mask, Op1, Op2);
29063
29064 return convertFromScalableVector(DAG, VT, ScalableRes);
29065}
29066
29067SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
29068 SDValue Op, SelectionDAG &DAG) const {
29069 SDLoc DL(Op);
29070 EVT InVT = Op.getOperand(0).getValueType();
29071 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29072
29073 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
29074 "Only expected to lower fixed length vector operation!");
29075 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
29076 "Expected integer result of the same bit length as the inputs!");
29077
29078 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29079 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
29080 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
29081
29082 EVT CmpVT = Pg.getValueType();
29083 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
29084 {Pg, Op1, Op2, Op.getOperand(2)});
29085
29086 EVT PromoteVT = ContainerVT.changeTypeToInteger();
29087 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
29088 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
29089}
29090
29091SDValue
29092AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
29093 SelectionDAG &DAG) const {
29094 SDLoc DL(Op);
29095 auto SrcOp = Op.getOperand(0);
29096 EVT VT = Op.getValueType();
29097 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29098 EVT ContainerSrcVT =
29099 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
29100
29101 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
29102 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
29103 return convertFromScalableVector(DAG, VT, Op);
29104}
29105
29106SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
29107 SDValue Op, SelectionDAG &DAG) const {
29108 SDLoc DL(Op);
29109 unsigned NumOperands = Op->getNumOperands();
29110
29111 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
29112 "Unexpected number of operands in CONCAT_VECTORS");
29113
29114 auto SrcOp1 = Op.getOperand(0);
29115 auto SrcOp2 = Op.getOperand(1);
29116 EVT VT = Op.getValueType();
29117 EVT SrcVT = SrcOp1.getValueType();
29118
29119 if (NumOperands > 2) {
29120     SmallVector<SDValue, 4> Ops;
29121     EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
29122 for (unsigned I = 0; I < NumOperands; I += 2)
29123 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
29124 Op->getOperand(I), Op->getOperand(I + 1)));
29125
29126 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
29127 }
29128
29129 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29130
29131   SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29132   SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
29133 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
29134
29135 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
29136
29137 return convertFromScalableVector(DAG, VT, Op);
29138}
29139
29140SDValue
29141AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
29142 SelectionDAG &DAG) const {
29143 EVT VT = Op.getValueType();
29144 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29145
29146 SDLoc DL(Op);
29147 SDValue Val = Op.getOperand(0);
29148 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29149 EVT SrcVT = Val.getValueType();
29150 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29151 EVT ExtendVT = ContainerVT.changeVectorElementType(
29152 SrcVT.getVectorElementType());
29153
29154 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
29155 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
29156
29157 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
29158 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
29159 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29160 Pg, Val, DAG.getUNDEF(ContainerVT));
29161
29162 return convertFromScalableVector(DAG, VT, Val);
29163}
29164
29165SDValue
29166AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
29167 SelectionDAG &DAG) const {
29168 EVT VT = Op.getValueType();
29169 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29170
29171 SDLoc DL(Op);
29172 SDValue Val = Op.getOperand(0);
29173 EVT SrcVT = Val.getValueType();
29174 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29175 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
29176 VT.getVectorElementType());
29177 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
29178
29179 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29180 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
29181 Op.getOperand(1), DAG.getUNDEF(RoundVT));
29182 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
29183 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
29184
29185 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29186 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29187}
29188
29189SDValue
29190AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
29191 SelectionDAG &DAG) const {
29192 EVT VT = Op.getValueType();
29193 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29194
29195 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
29196 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
29197 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
29198
29199 SDLoc DL(Op);
29200 SDValue Val = Op.getOperand(0);
29201 EVT SrcVT = Val.getValueType();
29202 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29203 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29204
29205 if (VT.bitsGE(SrcVT)) {
29206 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29207
29208 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
29209 VT.changeTypeToInteger(), Val);
29210
29211 // Safe to use a larger than specified operand because by promoting the
29212 // value nothing has changed from an arithmetic point of view.
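// e.g. a v4i16 -> v4f32 conversion first sign/zero extends the i16 lanes to
// i32 and then converts the wider integer lanes directly.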
29213 Val =
29214 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
29215 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
29216 DAG.getUNDEF(ContainerDstVT));
29217 return convertFromScalableVector(DAG, VT, Val);
29218 } else {
29219 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
29220 ContainerDstVT.getVectorElementType());
29221 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
29222
29223 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29224 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
29225 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
29226 Val = convertFromScalableVector(DAG, SrcVT, Val);
29227
29228 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29229 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29230 }
29231}
29232
29233SDValue
29234AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
29235 SelectionDAG &DAG) const {
29236 SDLoc DL(Op);
29237 EVT OpVT = Op.getValueType();
29238 assert(OpVT.isScalableVector() &&
29239 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
29240 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
29241 Op.getOperand(1));
29242 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
29243 Op.getOperand(1));
29244 return DAG.getMergeValues({Even, Odd}, DL);
29245}
29246
29247SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
29248 SelectionDAG &DAG) const {
29249 SDLoc DL(Op);
29250 EVT OpVT = Op.getValueType();
29251 assert(OpVT.isScalableVector() &&
29252 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
29253
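// ZIP1 interleaves the low halves of the two operands and ZIP2 the high
// halves, e.g. for {a0 a1 a2 a3} and {b0 b1 b2 b3} this gives
// Lo = {a0 b0 a1 b1} and Hi = {a2 b2 a3 b3}.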
29254 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
29255 Op.getOperand(1));
29256 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
29257 Op.getOperand(1));
29258 return DAG.getMergeValues({Lo, Hi}, DL);
29259}
29260
29261SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
29262 SelectionDAG &DAG) const {
29263 // FIXME: Maybe share some code with LowerMGather/Scatter?
29264 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
29265 SDLoc DL(HG);
29266 SDValue Chain = HG->getChain();
29267 SDValue Inc = HG->getInc();
29268 SDValue Mask = HG->getMask();
29269 SDValue Ptr = HG->getBasePtr();
29270 SDValue Index = HG->getIndex();
29271 SDValue Scale = HG->getScale();
29272 SDValue IntID = HG->getIntID();
29273
29274 // The Intrinsic ID determines the type of update operation.
29275 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
29276 // Right now, we only support 'add' as an update.
29277 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
29278 "Unexpected histogram update operation");
29279
29280 EVT IndexVT = Index.getValueType();
29281 LLVMContext &Ctx = *DAG.getContext();
29282 ElementCount EC = IndexVT.getVectorElementCount();
29283 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
29284 EVT IncExtVT =
29285 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
29286 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
29287 bool ExtTrunc = IncSplatVT != MemVT;
29288
29289 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29290 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
29291 SDValue IncSplat = DAG.getSplatVector(
29292 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
29293 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
29294
29295 MachineMemOperand *MMO = HG->getMemOperand();
29296 // Create an MMO for the gather, without load|store flags.
29297 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
29298 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
29299 MMO->getAlign(), MMO->getAAInfo());
29300 ISD::MemIndexType IndexType = HG->getIndexType();
29301 SDValue Gather = DAG.getMaskedGather(
29302 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
29303 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
29304
29305 SDValue GChain = Gather.getValue(1);
29306
29307 // Perform the histcnt, multiply by inc, add to bucket data.
29308 SDValue ID =
29309 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
29310 SDValue HistCnt =
29311 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
29312 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
29313 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
29314
29315 // Create an MMO for the scatter, without load|store flags.
29316 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
29317 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
29318 MMO->getAlign(), MMO->getAAInfo());
29319
29320 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
29321 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
29322 ScatterOps, SMMO, IndexType, ExtTrunc);
29323 return Scatter;
29324}
29325
29326SDValue
29327AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
29328 SelectionDAG &DAG) const {
29329 EVT VT = Op.getValueType();
29330 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29331
29332 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
29333 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
29334 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
29335
29336 SDLoc DL(Op);
29337 SDValue Val = Op.getOperand(0);
29338 EVT SrcVT = Val.getValueType();
29339 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29340 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29341
29342 if (VT.bitsGT(SrcVT)) {
29343 EVT CvtVT = ContainerDstVT.changeVectorElementType(
29344 ContainerSrcVT.getVectorElementType());
29345 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29346
29347 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
29348 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
29349
29350 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
29351 Val = getSVESafeBitCast(CvtVT, Val, DAG);
29352 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
29353 DAG.getUNDEF(ContainerDstVT));
29354 return convertFromScalableVector(DAG, VT, Val);
29355 } else {
29356 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
29357 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
29358
29359 // Safe to use a larger than specified result since an fp_to_int where the
29360 // result doesn't fit into the destination is undefined.
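// e.g. a v4f64 -> v4i16 conversion is performed as an f64 -> i64 convert
// whose results are then truncated to i16.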
29361 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29362 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
29363 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
29364
29365 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
29366 }
29367}
29368
29369static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
29370 ArrayRef<int> ShuffleMask, EVT VT,
29371 EVT ContainerVT, SelectionDAG &DAG) {
29372 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29373 SDLoc DL(Op);
29374 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29375 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29376 bool IsSingleOp =
29377 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
29378
29379 if (!Subtarget.isNeonAvailable() && !MinSVESize)
29380 MinSVESize = 128;
29381
29382 // Bail out on two-operand shuffles if SVE2 is unavailable or if not all
29383 // index values can be represented.
29384 if (!IsSingleOp && !Subtarget.hasSVE2())
29385 return SDValue();
29386
29387 EVT VTOp1 = Op.getOperand(0).getValueType();
29388 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
29389 unsigned IndexLen = MinSVESize / BitsPerElt;
29390 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
29391 uint64_t MaxOffset = maxUIntN(BitsPerElt);
29392 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
29393 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
29394 bool MinMaxEqual = (MinSVESize == MaxSVESize);
29395 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
29396 "Incorrectly legalised shuffle operation");
29397
29398 SmallVector<SDValue, 8> TBLMask;
29399 // If MinSVESize is not equal to MaxSVESize then we need to know which
29400 // TBL mask element needs adjustment.
29401 SmallVector<SDValue, 8> AddRuntimeVLMask;
29402
29403 // Bail out for 8-bit element types, because with a 2048-bit SVE register
29404 // size 8 bits is only sufficient to index into the first source vector.
29405 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
29406 return SDValue();
29407
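// e.g. for a fixed-length v4i32 shuffle with a known 256-bit SVE register
// (IndexLen == 8, ElementsPerVectorReg == 4), shuffle index 5 (element 1 of
// the second operand) is rebased to 5 + (8 - 4) == 9 within the concatenated
// TBL table. With an unknown register size the index instead becomes
// 5 - 4 == 1 and a runtime multiple of the vector length is recorded in
// AddRuntimeVLMask and added to the mask later.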
29408 for (int Index : ShuffleMask) {
29409 // Handle poison index values.
29410 if (Index < 0)
29411 Index = 0;
29412 // If the mask refers to elements in the second operand, then we have to
29413 // offset the index by the number of elements in a vector. If this number
29414 // is not known at compile-time, we need to maintain a mask with 'VL' values
29415 // to add at runtime.
29416 if ((unsigned)Index >= ElementsPerVectorReg) {
29417 if (MinMaxEqual) {
29418 Index += IndexLen - ElementsPerVectorReg;
29419 } else {
29420 Index = Index - ElementsPerVectorReg;
29421 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
29422 }
29423 } else if (!MinMaxEqual)
29424 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
29425 // For 8-bit elements with a 1024-bit SVE register, where MaxOffset equals
29426 // 255, an index might point to the last element in the second operand of
29427 // the shufflevector, so we reject the transform in that case.
29428 if ((unsigned)Index >= MaxOffset)
29429 return SDValue();
29430 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
29431 }
29432
29433 // Pad the remaining mask slots with an out-of-range index so those lanes
29434 // are zeroed, whereas an index of zero would duplicate the first lane into
29435 // them. Note that for i8 elements an out-of-range index could still be a
29436 // valid index for a 2048-bit vector register size.
29437 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
29438 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
29439 if (!MinMaxEqual)
29440 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
29441 }
29442
29443 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
29444 SDValue VecMask =
29445 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
29446 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
29447
29448 SDValue Shuffle;
29449 if (IsSingleOp)
29450 Shuffle =
29451 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
29452 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
29453 Op1, SVEMask);
29454 else if (Subtarget.hasSVE2()) {
29455 if (!MinMaxEqual) {
29456 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
29457 SDValue VScale = (BitsPerElt == 64)
29458 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
29459 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
29460 SDValue VecMask =
29461 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
29462 SDValue MulByMask = DAG.getNode(
29463 ISD::MUL, DL, MaskType,
29464 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
29465 DAG.getBuildVector(MaskType, DL,
29466 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
29467 SDValue UpdatedVecMask =
29468 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
29469 SVEMask = convertToScalableVector(
29470 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
29471 }
29472 Shuffle =
29473 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
29474 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
29475 Op1, Op2, SVEMask);
29476 }
29477 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
29478 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
29479}
29480
29481SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
29482 SDValue Op, SelectionDAG &DAG) const {
29483 EVT VT = Op.getValueType();
29484 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29485
29486 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
29487 auto ShuffleMask = SVN->getMask();
29488
29489 SDLoc DL(Op);
29490 SDValue Op1 = Op.getOperand(0);
29491 SDValue Op2 = Op.getOperand(1);
29492
29493 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29494 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
29495 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
29496
29497 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
29498 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
29499 return MVT::i32;
29500 return ScalarTy;
29501 };
29502
29503 if (SVN->isSplat()) {
29504 unsigned Lane = std::max(0, SVN->getSplatIndex());
29505 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29506 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
29507 DAG.getConstant(Lane, DL, MVT::i64));
29508 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
29509 return convertFromScalableVector(DAG, VT, Op);
29510 }
29511
29512 bool ReverseEXT = false;
29513 unsigned Imm;
29514 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
29515 Imm == VT.getVectorNumElements() - 1) {
29516 if (ReverseEXT)
29517 std::swap(Op1, Op2);
29518 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29519 SDValue Scalar = DAG.getNode(
29520 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
29521 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
29522 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
29523 return convertFromScalableVector(DAG, VT, Op);
29524 }
29525
29526 unsigned EltSize = VT.getScalarSizeInBits();
29527 for (unsigned LaneSize : {64U, 32U, 16U}) {
29528 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
29529 EVT NewVT =
29530 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
29531 unsigned RevOp;
29532 if (EltSize == 8)
29533 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
29534 else if (EltSize == 16)
29535 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
29536 else
29537 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
29538
29539 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
29540 Op = LowerToPredicatedOp(Op, DAG, RevOp);
29541 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
29542 return convertFromScalableVector(DAG, VT, Op);
29543 }
29544 }
29545
29546 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
29547 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
29548 if (!VT.isFloatingPoint())
29549 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
29550
29551 EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
29552 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
29553 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
29554 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
29555 return convertFromScalableVector(DAG, VT, Op);
29556 }
29557
29558 unsigned WhichResult;
29559 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
29560 WhichResult == 0)
29561 return convertFromScalableVector(
29562 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
29563
29564 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
29565 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29566 return convertFromScalableVector(
29567 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
29568 }
29569
29570 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
29571 return convertFromScalableVector(
29572 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
29573
29574 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
29575 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29576 return convertFromScalableVector(
29577 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
29578 }
29579
29580 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
29581 // represents the same logical operation as performed by a ZIP instruction. In
29582 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
29583 // equivalent to an AArch64 instruction. There's the extra component of
29584 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
29585 // only operated on 64/128-bit vector types that have a direct mapping to a
29586 // target register and so an exact mapping is implied.
29587 // However, when using SVE for fixed length vectors, most legal vector types
29588 // are actually sub-vectors of a larger SVE register. When mapping
29589 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
29590 // how the mask's indices translate. Specifically, when the mapping requires
29591 // an exact meaning for a specific vector index (e.g. Index X is the last
29592 // vector element in the register) then such mappings are often only safe when
29593 // the exact SVE register size is known. The main exception to this is when
29594 // indices are logically relative to the first element of either
29595 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
29596 // when converting from fixed-length to scalable vector types (i.e. the start
29597 // of a fixed length vector is always the start of a scalable vector).
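// For example, a reverse shuffle of a 128-bit fixed-length vector only maps
// onto VECTOR_REVERSE of the container when the SVE register is known to be
// exactly 128 bits wide; with a wider register the reversed data would land
// in the unused high part of the register.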
29598 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
29599 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
29600 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
29601 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
29602 Op2.isUndef()) {
29603 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
29604 return convertFromScalableVector(DAG, VT, Op);
29605 }
29606
29607 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
29608 WhichResult != 0)
29609 return convertFromScalableVector(
29610 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
29611
29612 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
29613 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
29614 return convertFromScalableVector(
29615 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
29616 }
29617
29618 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
29619 return convertFromScalableVector(
29620 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
29621
29622 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
29623 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
29624 return convertFromScalableVector(
29625 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
29626 }
29627 }
29628
29629 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
29630 // This may allow the shuffle to be matched as something cheaper like ZIP1.
29631 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
29632 return WideOp;
29633
29634 // Avoid producing TBL instruction if we don't know SVE register minimal size,
29635 // unless NEON is not available and we can assume minimal SVE register size is
29636 // 128-bits.
29637 if (MinSVESize || !Subtarget->isNeonAvailable())
29638 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
29639 DAG);
29640
29641 return SDValue();
29642}
29643
29644SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
29645 SelectionDAG &DAG) const {
29646 SDLoc DL(Op);
29647 EVT InVT = Op.getValueType();
29648
29649 assert(VT.isScalableVector() && isTypeLegal(VT) &&
29650 InVT.isScalableVector() && isTypeLegal(InVT) &&
29651 "Only expect to cast between legal scalable vector types!");
29652 assert(VT.getVectorElementType() != MVT::i1 &&
29653 InVT.getVectorElementType() != MVT::i1 &&
29654 "For predicate bitcasts, use getSVEPredicateBitCast");
29655
29656 if (InVT == VT)
29657 return Op;
29658
29659 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
29660 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
29661
29662 // Safe bitcasting between unpacked vector types of different element counts
29663 // is currently unsupported because the following is missing the necessary
29664 // work to ensure the result's elements live where they're supposed to within
29665 // an SVE register.
29666 // 01234567
29667 // e.g. nxv2i32 = XX??XX??
29668 // nxv4f16 = X?X?X?X?
29669 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
29670 VT == PackedVT || InVT == PackedInVT) &&
29671 "Unexpected bitcast!");
29672
29673 // Pack input if required.
29674 if (InVT != PackedInVT)
29675 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
29676
29677 if (Subtarget->isLittleEndian() ||
29678 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
29679 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
29680 else {
29681 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
29682 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
29683
29684 // Simulate the effect of casting through memory.
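// e.g. for an nxv8i16 -> nxv4i32 cast this byte swaps each i16 lane into
// memory order, renumbers the lanes as i32 via NVCAST, then byte swaps each
// i32 lane back into register order.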
29685 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
29686 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
29687 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
29688 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
29689 if (PackedVTAsInt.getScalarSizeInBits() != 8)
29690 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
29691 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
29692 }
29693
29694 // Unpack result if required.
29695 if (VT != PackedVT)
29696 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
29697
29698 return Op;
29699}
29700
29701bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
29702 SDValue N) const {
29703 return ::isAllActivePredicate(DAG, N);
29704}
29705
29706EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
29707 return ::getPromotedVTForPredicate(VT);
29708}
29709
29710bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
29711 SDValue Op, const APInt &OriginalDemandedBits,
29712 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
29713 unsigned Depth) const {
29714
29715 unsigned Opc = Op.getOpcode();
29716 switch (Opc) {
29717 case AArch64ISD::VSHL: {
29718 // Match (VSHL (VLSHR Val X) X)
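// e.g. with 32-bit lanes and X == 8 the shift pair only clears the low 8
// bits of Val; if none of those bits are demanded the pair can be removed
// and Val used directly.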
29719 SDValue ShiftL = Op;
29720 SDValue ShiftR = Op->getOperand(0);
29721 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
29722 return false;
29723
29724 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
29725 return false;
29726
29727 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
29728 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
29729
29730 // Other cases can be handled as well, but this is not
29731 // implemented.
29732 if (ShiftRBits != ShiftLBits)
29733 return false;
29734
29735 unsigned ScalarSize = Op.getScalarValueSizeInBits();
29736 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
29737
29738 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
29739 APInt UnusedBits = ~OriginalDemandedBits;
29740
29741 if ((ZeroBits & UnusedBits) != ZeroBits)
29742 return false;
29743
29744 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
29745 // used - simplify to just Val.
29746 return TLO.CombineTo(Op, ShiftR->getOperand(0));
29747 }
29748 case AArch64ISD::BICi: {
29749 // Fold BICi if all destination bits already known to be zeroed
29750 SDValue Op0 = Op.getOperand(0);
29751 KnownBits KnownOp0 =
29752 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
29753 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
29754 APInt BitsToClear =
29755 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
29756 .trunc(KnownOp0.getBitWidth());
29757 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
29758 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
29759 return TLO.CombineTo(Op, Op0);
29760
29761 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
29762 return false;
29763 }
29764 case ISD::INTRINSIC_WO_CHAIN: {
29765 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
29766 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
29767 if (!MaxSVEVectorSizeInBits)
29768 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
29769 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
29770 // The SVE count intrinsics don't support the multiplier immediate so we
29771 // don't have to account for that here. The value returned may be slightly
29772 // over the true required bits, as this is based on the "ALL" pattern. The
29773 // other patterns are also exposed by these intrinsics, but they all
29774 // return a value that's strictly less than "ALL".
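// e.g. with a 2048-bit maximum vector length, CNTB can return at most 256,
// so only the low 9 bits of the result can ever be set.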
29775 unsigned RequiredBits = llvm::bit_width(MaxElements);
29776 unsigned BitWidth = Known.Zero.getBitWidth();
29777 if (RequiredBits < BitWidth)
29778 Known.Zero.setHighBits(BitWidth - RequiredBits);
29779 return false;
29780 }
29781 }
29782 }
29783
29784 return TargetLowering::SimplifyDemandedBitsForTargetNode(
29785 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
29786}
29787
29788bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
29789 return Op.getOpcode() == AArch64ISD::DUP ||
29790 Op.getOpcode() == AArch64ISD::MOVI ||
29791 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
29792 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
29793 TargetLowering::isTargetCanonicalConstantNode(Op);
29794}
29795
29796bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
29797 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
29798 Subtarget->hasComplxNum();
29799}
29800
29801bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
29802 ComplexDeinterleavingOperation Operation, Type *Ty) const {
29803 auto *VTy = dyn_cast<VectorType>(Ty);
29804 if (!VTy)
29805 return false;
29806
29807 // If the vector is scalable, SVE is enabled, implying support for complex
29808 // numbers. Otherwise, we need to ensure complex number support is available.
29809 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
29810 return false;
29811
29812 auto *ScalarTy = VTy->getScalarType();
29813 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
29814
29815 // We can only process vectors that have a bit size of 128 or higher (with an
29816 // additional 64 bits for Neon). Additionally, these vectors must have a
29817 // power-of-2 size, as we later split them into the smallest supported size
29818 // and merge them back together after applying the complex operation.
29819 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
29820 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
29821 !llvm::isPowerOf2_32(VTyWidth))
29822 return false;
29823
29824 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
29825 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
29826
29827 if (Operation == ComplexDeinterleavingOperation::CDot)
29828 return ScalarWidth == 32 || ScalarWidth == 64;
29829 return 8 <= ScalarWidth && ScalarWidth <= 64;
29830 }
29831
29832 // CDot is not supported outside of scalable/SVE scopes.
29833 if (Operation == ComplexDeinterleavingOperation::CDot)
29834 return false;
29835
29836 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
29837 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
29838}
29839
29840Value *AArch64TargetLowering::createComplexDeinterleavingIR(
29841 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
29842 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
29843 Value *Accumulator) const {
29844 VectorType *Ty = cast<VectorType>(InputA->getType());
29845 if (Accumulator == nullptr)
29846 Accumulator = Constant::getNullValue(Ty);
29847 bool IsScalable = Ty->isScalableTy();
29848 bool IsInt = Ty->getElementType()->isIntegerTy();
29849
29850 unsigned TyWidth =
29851 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
29852
29853 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
29854 "Vector type must be either 64 or a power of 2 that is at least 128");
29855
29856 if (TyWidth > 128) {
29857 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
29858 int AccStride = cast<VectorType>(Accumulator->getType())
29859 ->getElementCount()
29860 .getKnownMinValue() /
29861 2;
29862 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
29863 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
29864 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
29865 auto *UpperSplitA =
29866 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
29867 auto *UpperSplitB =
29868 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
29869 Value *LowerSplitAcc = nullptr;
29870 Value *UpperSplitAcc = nullptr;
29871 Type *FullTy = Ty;
29872 FullTy = Accumulator->getType();
29873 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
29874 cast<VectorType>(Accumulator->getType()));
29875 LowerSplitAcc =
29876 B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
29877 UpperSplitAcc =
29878 B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
29879 auto *LowerSplitInt = createComplexDeinterleavingIR(
29880 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
29881 auto *UpperSplitInt = createComplexDeinterleavingIR(
29882 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
29883
29884 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
29885 LowerSplitInt, B.getInt64(0));
29886 return B.CreateInsertVector(FullTy, Result, UpperSplitInt,
29887 B.getInt64(AccStride));
29888 }
29889
29890 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
29891 if (IsScalable) {
29892 if (IsInt)
29893 return B.CreateIntrinsic(
29894 Intrinsic::aarch64_sve_cmla_x, Ty,
29895 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29896
29897 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
29898 return B.CreateIntrinsic(
29899 Intrinsic::aarch64_sve_fcmla, Ty,
29900 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29901 }
29902
29903 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
29904 Intrinsic::aarch64_neon_vcmla_rot90,
29905 Intrinsic::aarch64_neon_vcmla_rot180,
29906 Intrinsic::aarch64_neon_vcmla_rot270};
29907
29908
29909 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
29910 {Accumulator, InputA, InputB});
29911 }
29912
29913 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
29914 if (IsScalable) {
29915 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
29916 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
29917 if (IsInt)
29918 return B.CreateIntrinsic(
29919 Intrinsic::aarch64_sve_cadd_x, Ty,
29920 {InputA, InputB, B.getInt32((int)Rotation * 90)});
29921
29922 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
29923 return B.CreateIntrinsic(
29924 Intrinsic::aarch64_sve_fcadd, Ty,
29925 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
29926 }
29927 return nullptr;
29928 }
29929
29930 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
29931 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
29932 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
29933 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
29934 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
29935
29936 if (IntId == Intrinsic::not_intrinsic)
29937 return nullptr;
29938
29939 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
29940 }
29941
29942 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
29943 IsScalable) {
29944 return B.CreateIntrinsic(
29945 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
29946 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29947 }
29948
29949 return nullptr;
29950}
29951
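// Prefer to keep splat(ext(x)) as a vector operation when the extend feeds a
// multiply, so the extend and multiply can still combine (e.g. into [su]mull).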
29952bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
29953 unsigned Opc = N->getOpcode();
29954 if (ISD::isExtOpcode(Opc)) {
29955 if (any_of(N->users(),
29956 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
29957 return false;
29958 }
29959 return true;
29960}
29961
29962unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
29963 return Subtarget->getMinimumJumpTableEntries();
29964}
29965
29966MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
29967 CallingConv::ID CC,
29968 EVT VT) const {
29969 bool NonUnitFixedLengthVector =
29970 VT.isFixedLengthVector() && VT.getVectorMinNumElements() != 1;
29971 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
29972 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
29973
29974 EVT VT1;
29975 MVT RegisterVT;
29976 unsigned NumIntermediates;
29977 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
29978 RegisterVT);
29979 return RegisterVT;
29980}
29981
29982unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
29983 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
29984 bool NonUnitFixedLengthVector =
29985 VT.isFixedLengthVector() && VT.getVectorMinNumElements() != 1;
29986 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
29987 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
29988
29989 EVT VT1;
29990 MVT VT2;
29991 unsigned NumIntermediates;
29992 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
29993 NumIntermediates, VT2);
29994}
29995
29996unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
29997 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
29998 unsigned &NumIntermediates, MVT &RegisterVT) const {
29999 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
30000 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
30001 if (!RegisterVT.isFixedLengthVector() ||
30002 RegisterVT.getFixedSizeInBits() <= 128)
30003 return NumRegs;
30004
30005 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
30006 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
30007 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
30008
30009 // A size mismatch here implies either type promotion or widening and would
30010 // have resulted in scalarisation if larger vectors had not been available.
30011 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
30012 EVT EltTy = VT.getVectorElementType();
30013 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
30014 if (!isTypeLegal(NewVT))
30015 NewVT = EltTy;
30016
30017 IntermediateVT = NewVT;
30018 NumIntermediates = VT.getVectorNumElements();
30019 RegisterVT = getRegisterType(Context, NewVT);
30020 return NumIntermediates;
30021 }
30022
30023 // SVE VLS support does not introduce a new ABI so we should use NEON sized
30024 // types for vector arguments and returns.
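// e.g. with a 512-bit SVE vector-length-specific configuration, a legal
// v16i32 argument is passed as four v4i32 values rather than one 512-bit
// register.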
30025
30026 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
30027 NumIntermediates *= NumSubRegs;
30028 NumRegs *= NumSubRegs;
30029
30030 switch (RegisterVT.getVectorElementType().SimpleTy) {
30031 default:
30032 llvm_unreachable("unexpected element type for vector");
30033 case MVT::i8:
30034 IntermediateVT = RegisterVT = MVT::v16i8;
30035 break;
30036 case MVT::i16:
30037 IntermediateVT = RegisterVT = MVT::v8i16;
30038 break;
30039 case MVT::i32:
30040 IntermediateVT = RegisterVT = MVT::v4i32;
30041 break;
30042 case MVT::i64:
30043 IntermediateVT = RegisterVT = MVT::v2i64;
30044 break;
30045 case MVT::f16:
30046 IntermediateVT = RegisterVT = MVT::v8f16;
30047 break;
30048 case MVT::f32:
30049 IntermediateVT = RegisterVT = MVT::v4f32;
30050 break;
30051 case MVT::f64:
30052 IntermediateVT = RegisterVT = MVT::v2f64;
30053 break;
30054 case MVT::bf16:
30055 IntermediateVT = RegisterVT = MVT::v8bf16;
30056 break;
30057 }
30058
30059 return NumRegs;
30060}
30061
30062bool AArch64TargetLowering::hasInlineStackProbe(
30063 const MachineFunction &MF) const {
30064 return !Subtarget->isTargetWindows() &&
30065 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
30066}
30067
30068bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
30069 switch (Opc) {
30070 case ISD::TRUNCATE_SSAT_S:
30071 case ISD::TRUNCATE_SSAT_U:
30072 case ISD::TRUNCATE_USAT_U:
30073 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
30074 return true;
30075 }
30076
30077 return TargetLowering::isTypeDesirableForOp(Opc, VT);
30078}
30079
30080#ifndef NDEBUG
30081void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
30082 switch (N->getOpcode()) {
30083 default:
30084 break;
30085 case AArch64ISD::SADDWT:
30086 case AArch64ISD::SADDWB:
30087 case AArch64ISD::UADDWT:
30088 case AArch64ISD::UADDWB: {
30089 assert(N->getNumValues() == 1 && "Expected one result!");
30090 assert(N->getNumOperands() == 2 && "Expected two operands!");
30091 EVT VT = N->getValueType(0);
30092 EVT Op0VT = N->getOperand(0).getValueType();
30093 EVT Op1VT = N->getOperand(1).getValueType();
30094 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
30095 VT.isInteger() && Op0VT.isInteger() && Op1VT.isInteger() &&
30096 "Expected integer vectors!");
30097 assert(VT == Op0VT &&
30098 "Expected result and first input to have the same type!");
30099 assert(Op0VT.getSizeInBits() == Op1VT.getSizeInBits() &&
30100 "Expected vectors of equal size!");
30101 assert(Op0VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount() &&
30102 "Expected result vector and first input vector to have half the "
30103 "lanes of the second input vector!");
30104 break;
30105 }
30106 case AArch64ISD::SUNPKLO:
30107 case AArch64ISD::SUNPKHI:
30108 case AArch64ISD::UUNPKLO:
30109 case AArch64ISD::UUNPKHI: {
30110 assert(N->getNumValues() == 1 && "Expected one result!");
30111 assert(N->getNumOperands() == 1 && "Expected one operand!");
30112 EVT VT = N->getValueType(0);
30113 EVT OpVT = N->getOperand(0).getValueType();
30114 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
30115 VT.isInteger() && "Expected integer vectors!");
30116 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
30117 "Expected vectors of equal size!");
30119 "Expected result vector with half the lanes of its input!");
30120 break;
30121 }
30122 case AArch64ISD::TRN1:
30123 case AArch64ISD::TRN2:
30124 case AArch64ISD::UZP1:
30125 case AArch64ISD::UZP2:
30126 case AArch64ISD::ZIP1:
30127 case AArch64ISD::ZIP2: {
30128 assert(N->getNumValues() == 1 && "Expected one result!");
30129 assert(N->getNumOperands() == 2 && "Expected two operands!");
30130 EVT VT = N->getValueType(0);
30131 EVT Op0VT = N->getOperand(0).getValueType();
30132 EVT Op1VT = N->getOperand(1).getValueType();
30133 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
30134 "Expected vectors!");
30135 assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
30136 break;
30137 }
30138 case AArch64ISD::RSHRNB_I: {
30139 assert(N->getNumValues() == 1 && "Expected one result!");
30140 assert(N->getNumOperands() == 2 && "Expected two operands!");
30141 EVT VT = N->getValueType(0);
30142 EVT Op0VT = N->getOperand(0).getValueType();
30143 EVT Op1VT = N->getOperand(1).getValueType();
30144 assert(VT.isVector() && VT.isInteger() &&
30145 "Expected integer vector result type!");
30146 assert(Op0VT.isVector() && Op0VT.isInteger() &&
30147 "Expected first operand to be an integer vector!");
30148 assert(VT.getSizeInBits() == Op0VT.getSizeInBits() &&
30149 "Expected vectors of equal size!");
30151 "Expected input vector with half the lanes of its result!");
30152 assert(Op1VT == MVT::i32 && isa<ConstantSDNode>(N->getOperand(1)) &&
30153 "Expected second operand to be a constant i32!");
30154 break;
30155 }
30156 }
30157}
30158#endif
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static std::optional< std::pair< unsigned, const TargetRegisterClass * > > parsePredicateRegAsConstraint(StringRef Constraint)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue tryCombineWhileLo(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue isNVCastToHalfWidthElements(SDValue V)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc dl)
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
bool shouldUseFormStridedPseudo(MachineInstr &MI)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG)
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static bool isConstant(const MachineInstr &MI)
static const LLT S1
static const LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T)
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static void replaceAllUsesWith(Value *Old, Value *New, SmallSet< BasicBlock *, 32 > &FreshBBs, bool IsHuge)
Replace all old uses with new ones, and push the updated BBs into FreshBBs.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx; this should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:235
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable, if it's a VBR encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
static Split data
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setSMESaveBufferUsed(bool Used=true)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
std::optional< uint16_t > getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const
Compute the integer discriminator for a given BlockAddress constant, if blockaddress signing is enabl...
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
bool isStreamingSVEAvailable() const
Returns true if the target has access to the streaming-compatible subset of SVE instructions.
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
const char * getChkStkName() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool hasCustomCallingConv() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override
Return true if the @llvm.experimental.vector.partial.reduce.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the preferred common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleaveIntrinsicToStore(StoreInst *SI, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI, ArrayRef< Value * > DeinterleaveValues) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached to strict FP calls.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1864
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1007
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1902
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1166
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1909
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:959
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
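As a quick illustration of the APInt interface referenced throughout the lowering code above, here is a small, self-contained usage sketch. It exercises only the documented operations listed here (construction, zero extension, bit queries) and makes no claims about how this file uses them; the values are arbitrary.

#include "llvm/ADT/APInt.h"
#include <cassert>

void apintSketch() {
  // 16-bit value 0x00F0.
  llvm::APInt V(16, 0x00F0);
  assert(!V.isPowerOf2());          // more than one bit set
  assert(V.countr_zero() == 4);     // four trailing zero bits

  // Widen to 32 bits; zext fills the new high bits with zero.
  llvm::APInt W = V.zext(32);
  assert(W.getBitWidth() == 32 && W.getZExtValue() == 0x00F0);

  // A mask covering the top 8 bits of a 32-bit value.
  llvm::APInt Hi = llvm::APInt::getHighBitsSet(32, 8);
  assert(Hi.getZExtValue() == 0xFF000000u);
  assert(!Hi.isMask(8));            // isMask checks for contiguous *low* bits
}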
an instruction to allocate memory on the stack
Definition: Instructions.h:63
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
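ArrayRef is the lightweight, non-owning view used for most of the shuffle-mask and register-list parameters above. A minimal usage sketch (the helper name sumMask is purely illustrative):

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

int sumMask(llvm::ArrayRef<int> Mask) {
  // ArrayRef does not own its elements; it just views contiguous memory.
  int Sum = 0;
  for (int M : Mask)
    Sum += M;
  return Sum;
}

void arrayRefSketch() {
  int Storage[] = {0, 2, 4, 6};
  llvm::ArrayRef<int> Mask(Storage);   // view over a C array
  assert(!Mask.empty() && Mask.size() == 4);
  assert(sumMask(Mask) == 12);
  assert(sumMask({1, 2, 3}) == 6);     // implicit view over an initializer list
}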
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ And
*p = old & v
Definition: Instructions.h:724
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
bool isFloatingPointOperation() const
Definition: Instructions.h:882
BinOp getOperation() const
Definition: Instructions.h:805
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
const BlockAddress * getBlockAddress() const
The address of a basic block.
Definition: Constants.h:893
Function * getFunction() const
Definition: Constants.h:923
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:197
IntegerType * getIntPtrType(LLVMContext &C, unsigned AddressSpace=0) const
Returns an integer type with size at least as big as that of a pointer in the given address space.
Definition: DataLayout.cpp:851
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
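The DataLayout queries listed above can be exercised on their own. The sketch below uses a deliberately simplified, hypothetical layout string (not the AArch64 one) purely to show the API shape.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

void dataLayoutSketch() {
  llvm::LLVMContext Ctx;
  // Hypothetical little-endian layout with 64-bit pointers and 64-bit i64 alignment.
  llvm::DataLayout DL("e-p:64:64-i64:64");

  llvm::Type *I64 = llvm::Type::getInt64Ty(Ctx);
  bool Little = DL.isLittleEndian();                  // true: leading "e"
  llvm::TypeSize Sz = DL.getTypeAllocSize(I64);       // 8 bytes, padding included
  llvm::Align Al = DL.getPrefTypeAlign(I64);          // preferred alignment of i64
  llvm::IntegerType *IntPtr = DL.getIntPtrType(Ctx);  // i64 for 64-bit pointers
  (void)Little; (void)Sz; (void)Al; (void)IntPtr;
}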
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
static FixedVectorType * getInteger(FixedVectorType *VTy)
Definition: DerivedTypes.h:575
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:719
bool empty() const
Definition: Function.h:871
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:716
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1048
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:365
arg_iterator arg_end()
Definition: Function.h:889
arg_iterator arg_begin()
Definition: Function.h:880
size_t size() const
Definition: Function.h:870
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:264
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:530
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:130
Type * getValueType() const
Definition: GlobalValue.h:297
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2162
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1072
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1887
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2106
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1163
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2555
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2079
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2147
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1480
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:495
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2093
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:516
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1459
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2033
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2019
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1540
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1499
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
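Many of the IRBuilderBase calls above appear in this file's interleaving and TBL-shuffle helpers. The fragment below is only a generic usage sketch, building a widening shift inside a standalone function; all names ("sketch", "widen") are chosen for illustration and do not correspond to anything in this file.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>

void irBuilderSketch() {
  llvm::LLVMContext Ctx;
  llvm::Module M("sketch", Ctx);

  // i32 widen(i8 %x) { return (i32)zext(%x) << 3; }
  auto *FT = llvm::FunctionType::get(llvm::Type::getInt32Ty(Ctx),
                                     {llvm::Type::getInt8Ty(Ctx)}, false);
  auto *F = llvm::Function::Create(FT, llvm::Function::ExternalLinkage,
                                   "widen", M);
  auto *BB = llvm::BasicBlock::Create(Ctx, "entry", F);

  llvm::IRBuilder<> B(BB);
  llvm::Value *X = F->getArg(0);
  llvm::Value *Z = B.CreateZExt(X, B.getInt32Ty(), "zext");
  llvm::Value *S = B.CreateShl(Z, B.getInt32(3), "shl");
  B.CreateRet(S);

  // verifyFunction returns true on a broken function.
  assert(!llvm::verifyFunction(*F, &llvm::errs()) && "should verify cleanly");
}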
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:200
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
Type * getPointerOperandType() const
Definition: Instructions.h:258
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static constexpr unsigned NoRegister
Definition: MCRegister.h:52
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
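A short sketch of querying MVTs with the accessors above; the include path is assumed for a recent LLVM tree and the checked type is arbitrary.
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// Build a 128-bit <4 x i32> value type and verify its element type, element
// count and total width with the MVT queries documented above.
static bool isFourLaneI32(MVT VT) {
  MVT V4i32 = MVT::getVectorVT(MVT::i32, 4);
  return VT == V4i32 && VT.is128BitVector() &&
         VT.getVectorElementType() == MVT::i32 &&
         VT.getVectorNumElements() == 4;
}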
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
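A hedged sketch of the MachineFrameInfo calls above: create a spill slot and a fixed incoming-stack-argument object, then read back the fixed object's offset. Sizes and offsets are placeholders.
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Reserve an 8-byte spill slot and an 8-byte fixed object at SP+0; the fixed
// object's offset is assigned at creation and can be queried immediately.
static int64_t makeFrameObjects(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int SpillFI = MFI.CreateSpillStackObject(8, Align(8));
  int ArgFI = MFI.CreateFixedObject(8, /*SPOffset=*/0, /*IsImmutable=*/true);
  (void)SpillFI;
  return MFI.getObjectOffset(ArgFI);
}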
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
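A minimal sketch of the MachineInstrBuilder chaining shown above (addReg/addImm on the result of BuildMI); the opcode, registers and immediate are placeholders, not a specific AArch64 instruction.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Emit "Dst = <Opcode> Src, #Imm" at the given insertion point.
static void emitRegImmOp(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator I, const DebugLoc &DL,
                         const TargetInstrInfo &TII, unsigned Opcode,
                         Register Dst, Register Src, int64_t Imm) {
  BuildMI(MBB, I, DL, TII.get(Opcode), Dst)
      .addReg(Src)
      .addImm(Imm);
}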
Representation of each machine instruction.
Definition: MachineInstr.h:71
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:712
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:354
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the node represents an undefined value.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
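A small sketch of inspecting an SDValue with the accessors above, the way the combines in this file typically do; the predicate itself is hypothetical.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Match a single-use (shl X, C) and return the constant shift amount.
static bool isSingleUseShlByConstant(SDValue V, uint64_t &ShAmt) {
  if (V.getOpcode() != ISD::SHL || !V.hasOneUse())
    return false;
  if (!isa<ConstantSDNode>(V.getOperand(1)))
    return false;
  ShAmt = V.getConstantOperandVal(1);
  return true;
}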
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
bool hasSharedZAInterface() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:610
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:812
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:499
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:761
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:505
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:713
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:891
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, SDValue Op2)
Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are its operands and ReducedTY i...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:797
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
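A brief sketch of building nodes with the SelectionDAG factory methods above; the combined expression is illustrative only.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Build (add (shl X, C), Y).  getShiftAmountConstant produces the shift
// amount in the target's preferred shift-amount type.
static SDValue buildShiftAdd(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                             SDValue X, SDValue Y, uint64_t C) {
  SDValue Amt = DAG.getShiftAmountConstant(C, VT, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, Amt);
  return DAG.getNode(ISD::ADD, DL, VT, Shl, Y);
}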
This instruction constructs a fixed permutation of two input vectors.
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
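A hedged sketch of classifying a constant shuffle mask with the static ShuffleVectorInst helpers above; the caller and what it does with the results are hypothetical.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Extract the integer mask from a constant shufflevector operand, then test
// whether it reads a single source vector and whether it reverses it.
static void classifyShuffle(const Constant *MaskC, int NumSrcElts) {
  SmallVector<int, 16> Mask;
  ShuffleVectorInst::getShuffleMask(MaskC, Mask);
  bool SingleSource = ShuffleVectorInst::isSingleSourceMask(Mask, NumSrcElts);
  bool Reverse = ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
  (void)SingleSource;
  (void)Reverse;
}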
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:286
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
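A short sketch of the common SmallVector/SmallSet pairing suggested by the entries above: collect values while skipping duplicates.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Return the input values with duplicates removed, preserving order.
static SmallVector<int, 8> uniqued(ArrayRef<int> In) {
  SmallSet<int, 8> Seen;
  SmallVector<int, 8> Out;
  for (int V : In)
    if (Seen.insert(V).second) // insert reports whether V was new
      Out.push_back(V);
  return Out;
}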
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:470
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:571
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition: StringRef.h:609
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:684
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
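A minimal sketch of StringRef parsing with the members above; the "lane.<N>" format is invented for illustration.
#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Accept strings of the form "lane.<decimal>" and parse out the lane number.
static bool parseLaneSuffix(StringRef S, unsigned &Lane) {
  if (!S.starts_with("lane."))
    return false;
  // getAsInteger returns true on parse failure.
  return !S.drop_front(5).getAsInteger(10, Lane);
}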
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
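A tiny sketch of the Case/Default chaining described above; the name-to-number mapping is illustrative only.
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Map a register-name string to an encoding, falling back to -1.
static int lookupNamedReg(StringRef Name) {
  return StringSwitch<int>(Name)
      .Case("fp", 29)
      .Case("lr", 30)
      .Case("sp", 31)
      .Default(-1);
}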
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
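A sketch of the pattern a target's TargetLowering constructor follows with the protected setters above (they can only be called from a subclass). The example class, register class and legality choices are illustrative, not AArch64's.
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

namespace {
class ExampleTargetLowering : public TargetLowering {
public:
  ExampleTargetLowering(const TargetMachine &TM,
                        const TargetRegisterClass *GPR32RC)
      : TargetLowering(TM) {
    // Make i32 a legal type backed by a 32-bit GPR class.
    addRegisterClass(MVT::i32, GPR32RC);
    // Mark operations that need library/expanded or custom handling.
    setOperationAction(ISD::SDIV, MVT::i32, Expand);
    setOperationAction(ISD::BR_CC, MVT::i32, Custom);
    setBooleanContents(ZeroOrOneBooleanContent);
  }
};
} // namespace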
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:80
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:668
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:261
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
@ HalfTyID
16-bit floating point type
Definition: Type.h:56
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition: Type.h:57
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
static IntegerType * getInt16Ty(LLVMContext &C)
bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt128Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
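A brief sketch of the IR Type queries above: construct <4 x i32> and check a candidate type against it. The predicate is hypothetical.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// True when Ty is a fixed <4 x i32> vector.
static bool isFourI32Lanes(LLVMContext &Ctx, Type *Ty) {
  Type *I32 = Type::getInt32Ty(Ctx);
  VectorType *V4I32 = VectorType::get(I32, ElementCount::getFixed(4));
  return Ty == V4I32 && Ty->isVectorTy() &&
         Ty->getScalarType()->isIntegerTy() &&
         Ty->getScalarSizeInBits() == 32;
}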
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:64
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:5304
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:531
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:478
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:496
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
self_iterator getIterator()
Definition: ilist_node.h:132
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
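A minimal usage sketch of these condition-code helpers (an editorial addition; the specific condition code below is an assumed example, not code from this file):

  AArch64CC::CondCode CC = AArch64CC::GE;                            // assumed example
  AArch64CC::CondCode Inverted = AArch64CC::getInvertedCondCode(CC); // AArch64CC::LT
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Inverted);     // flags that make LT hold
  (void)NZCV;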
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
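As a hedged illustration of the usual check-then-encode pattern for bitmask immediates (the constant is an assumed example):

  uint64_t Imm = 0x00FF00FF00FF00FFULL;            // repeating 16-bit pattern
  if (AArch64_AM::isLogicalImmediate(Imm, 64)) {
    uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, 64);
    // Enc holds the N:immr:imms encoding used by AND/ORR/EOR (immediate) forms.
    (void)Enc;
  }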
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
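A short sketch of querying the 8-bit FMOV immediate encoders (editorial addition; the constant is an assumed example, and the -1 "not representable" convention follows the getFP32Imm description above):

  APFloat Val(2.0);                                        // assumed example constant
  int Imm8 = AArch64_AM::getFP64Imm(Val.bitcastToAPInt());
  if (Imm8 != -1) {
    // 2.0 is FMOV-encodable; Imm8 is the encoded 8-bit immediate.
  }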
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
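A hedged sketch of how the AdvSIMD modified-immediate predicates and encoders pair up (the value below is an illustrative assumption):

  // Type 1: abcdefgh replicated in the low byte of each 32-bit element.
  uint64_t Imm = 0x000000FF000000FFULL;
  if (AArch64_AM::isAdvSIMDModImmType1(Imm)) {
    uint8_t Enc = AArch64_AM::encodeAdvSIMDModImmType1(Imm);
    // Enc is the 8-bit payload placed in the instruction's imm8 field.
    (void)Enc;
  }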
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
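A minimal sketch of expanding a constant into move-immediate steps, assuming the AArch64_IMM namespace declared in AArch64ExpandImm.h (the constant is an assumed example):

  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(0x123456789ABCDEF0ULL, 64, Insn);
  // Insn now holds one entry per real move-immediate instruction to emit.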
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Flow Guard check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:271
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general-purpose registers.
Definition: CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1197
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:491
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1417
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1360
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1450
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1226
@ ConstantFP
Definition: ISDOpcodes.h:77
@ STRICT_FATAN2
Definition: ISDOpcodes.h:428
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ STRICT_FCEIL
Definition: ISDOpcodes.h:441
@ STRICT_FTANH
Definition: ISDOpcodes.h:431
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1092
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:451
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1435
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1439
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1096
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1449
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:492
@ STRICT_FLOG2
Definition: ISDOpcodes.h:436
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1270
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1494
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ STRICT_FASIN
Definition: ISDOpcodes.h:425
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:685
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:465
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1432
@ STRICT_FATAN
Definition: ISDOpcodes.h:427
@ WRITE_REGISTER
Definition: ISDOpcodes.h:125
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ TRUNCATE_SSAT_U
Definition: ISDOpcodes.h:834
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1436
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:788
@ STRICT_LROUND
Definition: ISDOpcodes.h:446
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:601
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:661
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ STRICT_FPOWI
Definition: ISDOpcodes.h:420
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1451
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:642
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1222
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:445
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1444
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1337
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition: ISDOpcodes.h:90
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:450
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:439
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:124
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:440
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ STRICT_FSINH
Definition: ISDOpcodes.h:429
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1407
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1286
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ STRICT_LRINT
Definition: ISDOpcodes.h:448
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:606
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ STRICT_FROUND
Definition: ISDOpcodes.h:443
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:464
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1372
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1452
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:442
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:444
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:458
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:457
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ STRICT_FCOSH
Definition: ISDOpcodes.h:430
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:680
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:435
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:449
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:627
@ STRICT_FEXP2
Definition: ISDOpcodes.h:433
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:669
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:112
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ STRICT_LLROUND
Definition: ISDOpcodes.h:447
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:882
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1481
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:438
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1440
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1217
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:437
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:595
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition: ISDOpcodes.h:832
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:692
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1276
@ TRUNCATE_USAT_U
Definition: ISDOpcodes.h:836
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ STRICT_FACOS
Definition: ISDOpcodes.h:426
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1681
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
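A small sketch of these two SetCC transforms (editorial addition; the predicate and type are assumed examples):

  ISD::CondCode CC = ISD::SETULT;
  ISD::CondCode Inv = ISD::getSetCCInverse(CC, MVT::i64);  // SETUGE: !(X u< Y)
  ISD::CondCode Swp = ISD::getSetCCSwappedOperands(CC);    // SETUGT: Y u> X
  (void)Inv; (void)Swp;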
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1572
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1559
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1610
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1590
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1561
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:732
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
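A hedged PatternMatch sketch combining the matchers listed above (V is an assumed in-scope Value*):

  using namespace llvm::PatternMatch;
  Value *A = nullptr, *B = nullptr;
  if (match(V, m_c_Mul(m_SExt(m_Value(A)), m_Value(B)))) {
    // V is a multiply where one operand (in either order) is a sign-extension;
    // A is the narrow value being extended, B the other multiplicand.
  }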
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:256
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
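A minimal sketch pairing peekThroughBitcasts with the constant predicates listed in this index (Op is an assumed SDValue under inspection):

  SDValue Src = peekThroughBitcasts(Op.getOperand(1));
  if (isNullConstant(Src) || isAllOnesConstant(Src)) {
    // The operand is 0 or all-ones once any bitcasts are stripped.
  }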
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:360
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or an FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:297
bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1547
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:347
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:286
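A short sketch of decomposing a shifted mask with these bit-math helpers (editorial addition; the bitmask is an assumed example):

  uint64_t Imm = 0x0000FFF0ULL;
  if (isShiftedMask_64(Imm)) {
    unsigned Shift = countr_zero(Imm);      // 4: zeros below the run of ones
    unsigned Width = llvm::popcount(Imm);   // 12: length of the run of ones
    (void)Shift; (void)Width;
  }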
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:274
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
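As a hedged illustration of the shuffle-mask classifiers (the mask below is an assumed example for eight elements):

  SmallVector<int, 8> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned WhichResult;
  if (isZIPMask(Mask, /*NumElts=*/8, WhichResult)) {
    // WhichResult == 0 here, i.e. the ZIP1 form; 1 would indicate ZIP2.
  }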
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:261
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
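A hedged sketch of the ComputeValueVTs overload shown above (TLI, DL and RetTy are assumed to be an in-scope TargetLowering, DataLayout and IR Type*):

  SmallVector<EVT, 4> ValueVTs, MemVTs;
  ComputeValueVTs(TLI, DL, RetTy, ValueVTs, &MemVTs);
  // ValueVTs now has one EVT per scalar or vector leaf of RetTy.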
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2099
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
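A minimal sketch of the alignment helpers listed here (the sizes and offsets are assumed examples):

  Align A(16);
  uint64_t Padded = alignTo(100, A);               // 112: next multiple of 16
  Align Common = commonAlignment(A, /*Offset=*/8); // Align(8): satisfies both constraints
  (void)Padded; (void)Common;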
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2087
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:220
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:315
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:397
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:439
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:187
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
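A brief sketch of building and querying EVTs with the accessors listed above (Ctx is an assumed in-scope LLVMContext):

  EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4);       // v4i32
  uint64_t EltBits = VT.getScalarSizeInBits();       // 32
  EVT Half = VT.getHalfNumVectorElementsVT(Ctx);     // v2i32
  EVT AsFP = VT.changeVectorElementType(MVT::f32);   // v4f32
  (void)EltBits; (void)Half; (void)AsFP;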
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:428
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:153
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:43
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:370
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:288
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:303
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition: KnownBits.h:127
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:285
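A short sketch of threading known bits through the generic transfer functions listed above (the constants are assumed examples):

  KnownBits LHS = KnownBits::makeConstant(APInt(32, 0xF0));
  KnownBits Amt = KnownBits::makeConstant(APInt(32, 4));
  KnownBits Res = KnownBits::shl(LHS, Amt);   // fully known: the constant 0xF00
  KnownBits Narrow = Res.trunc(16);           // still a known constant in 16 bits
  (void)Narrow;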
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64