1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
60#include "llvm/IR/Attributes.h"
61#include "llvm/IR/Constants.h"
62#include "llvm/IR/DataLayout.h"
63#include "llvm/IR/DebugLoc.h"
65#include "llvm/IR/Function.h"
67#include "llvm/IR/GlobalValue.h"
68#include "llvm/IR/IRBuilder.h"
69#include "llvm/IR/Instruction.h"
72#include "llvm/IR/Intrinsics.h"
73#include "llvm/IR/IntrinsicsAArch64.h"
74#include "llvm/IR/Module.h"
76#include "llvm/IR/Type.h"
77#include "llvm/IR/Use.h"
78#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP instructions use ALU ports, and the data
143// dependency will become the bottleneck after this transform on high-end
144// CPUs. So this max leaf node limitation guards that cmp+ccmp will still be profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
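// Illustrative note (not part of the original source): these are the AAPCS64
// argument registers, i.e. integer arguments are passed in X0-X7 and
// floating-point/vector arguments in Q0-Q7; the calling-convention lowering
// later in this file is the authoritative use of these tables.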
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
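// Illustrative summary of the helper below: it maps a scalar element type to
// the corresponding packed scalable vector type, e.g. MVT::f32 -> MVT::nxv4f32
// and MVT::i8 -> MVT::nxv16i8, i.e. as many elements as fit in a 128-bit
// SVE granule.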
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
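// For example (illustrative), an element count with known minimum 4 maps to
// MVT::nxv4i32, matching the switch below.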
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
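// Illustrative summary of the helper below: it returns the integer vector type
// a scalable predicate is promoted to, e.g. nxv4i1 -> nxv4i32 and
// nxv16i1 -> nxv16i8.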
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
290
291// Returns true if inactive lanes are known to be zeroed by construction.
293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
353
354static std::tuple<SDValue, SDValue>
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
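// Illustrative behaviour of the helper above (not part of the original
// source): a discriminator built as @llvm.ptrauth.blend(%addr, 1234) yields
// the pair (target constant 1234, %addr); a plain constant that fits in 16
// bits yields (that constant, NoRegister); anything else yields (0, Disc) so
// the discriminator is computed separately.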
387
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add legal sve data types
449 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
451 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
452 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
453
454 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
456 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
457 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
460
461 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
463 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
464
465 if (Subtarget->useSVEForFixedLengthVectors()) {
468 addRegisterClass(VT, &AArch64::ZPRRegClass);
469
472 addRegisterClass(VT, &AArch64::ZPRRegClass);
473 }
474 }
475
476 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
477 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
478 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
479 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
480
481 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
482 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
483 }
484
485 // Compute derived properties from the register classes
487
488 // Provide all sorts of operation actions
514 if (Subtarget->hasFPARMv8()) {
517 }
530
532
536
540
542
543 // Custom lowering hooks are needed for XOR
544 // to fold it into CSINC/CSINV.
547
550
551 // Virtually no operation on f128 is legal, but LLVM can't expand them when
552 // there's a valid register class, so we need custom operations in most cases.
577 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
578 // aren't handled.
579
580 // Lowering for many of the conversions is actually specified by the non-f128
581 // type. The LowerXXX function will be trivial when f128 isn't involved.
606 if (Subtarget->hasFPARMv8()) {
609 }
612 if (Subtarget->hasFPARMv8()) {
615 }
618
623
624 // Variable arguments.
629
630 // Variable-sized objects.
633
634 // Lowering Funnel Shifts to EXTR
639
641
642 // Constant pool entries
644
645 // BlockAddress
647
648 // AArch64 lacks both left-rotate and popcount instructions.
654 }
655
656 // AArch64 doesn't have i32 MULH{S|U}.
659
660 // AArch64 doesn't have {U|S}MUL_LOHI.
665
666 if (Subtarget->hasCSSC()) {
670
672
676
679
684
689 } else {
693
696
699 }
700
706 }
713
714 // Custom lower Add/Sub/Mul with overflow.
727
736
745 if (Subtarget->hasFullFP16()) {
748 } else {
751 }
752
753 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
767 setOperationAction(Op, MVT::f16, Promote);
768 setOperationAction(Op, MVT::v4f16, Expand);
769 setOperationAction(Op, MVT::v8f16, Expand);
770 setOperationAction(Op, MVT::bf16, Promote);
771 setOperationAction(Op, MVT::v4bf16, Expand);
772 setOperationAction(Op, MVT::v8bf16, Expand);
773 }
774
775 // Legalize fcanonicalize to circumvent default expansion
776 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
777 if (Subtarget->hasFullFP16()) {
779 }
780
781 // fpextend from f16 or bf16 to f32 is legal
786 // fpextend from bf16 to f64 needs to be split into two fpextends
789
790 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
791 for (auto Op : {
795 ISD::FADD,
796 ISD::FSUB,
797 ISD::FMUL,
798 ISD::FDIV,
799 ISD::FMA,
832 })
833 setOperationAction(Op, ScalarVT, Promote);
834
835 for (auto Op : {ISD::FNEG, ISD::FABS})
836 setOperationAction(Op, ScalarVT, Legal);
837
838 // Round-to-integer operations need custom lowering for fp16, as Promote
839 // doesn't work because the result type is integer.
843 setOperationAction(Op, ScalarVT, Custom);
844
845 // promote v4f16 to v4f32 when that is known to be safe.
846 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
847 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
854 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
855 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
856 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
857 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
858 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
859 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
860
869
870 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
871 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
872 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
873
894 };
895
896 if (!Subtarget->hasFullFP16()) {
897 LegalizeNarrowFP(MVT::f16);
898 }
899 LegalizeNarrowFP(MVT::bf16);
902
903 // AArch64 has implementations of a lot of rounding-like FP operations.
904 // clang-format off
905 for (auto Op :
917 for (MVT Ty : {MVT::f32, MVT::f64})
919 if (Subtarget->hasFullFP16())
920 setOperationAction(Op, MVT::f16, Legal);
921 }
922 // clang-format on
923
924 // Basic strict FP operations are legal
927 for (MVT Ty : {MVT::f32, MVT::f64})
929 if (Subtarget->hasFullFP16())
930 setOperationAction(Op, MVT::f16, Legal);
931 }
932
934
940
942 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
945 } else {
948 }
951
952 // Generate outline atomics library calls only if LSE was not specified for
953 // the subtarget.
954 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
980 }
981
982 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
987
992
997
1002
1007 }
1008
1009 if (Subtarget->hasLSE128()) {
1010 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1011 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1015 }
1016
1017 // 128-bit loads and stores can be done without expanding
1018 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1020
1021 // Aligned 128-bit loads and stores are single-copy atomic according to the
1022 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1023 if (Subtarget->hasLSE2()) {
1026 }
1027
1028 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1029 // custom lowering, as there are no un-paired non-temporal stores and
1030 // legalization will break up 256 bit inputs.
1031 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1032 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1033 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1034 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1035 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1036 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1037 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1038 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1039
1040 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1041 // custom lowering, as there are no un-paired non-temporal loads and
1042 // legalization will break up 256 bit inputs.
1043 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1045 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1046 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1047 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1048 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1049 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1050 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1051
1052 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1054
1055 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057 // Issue __sincos_stret if available.
1060 } else {
1063 }
1064
1065 // Make floating-point constants legal for the large code model, so they don't
1066 // become loads from the constant pool.
1067 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1070 }
1071
1072 // AArch64 does not have floating-point extending loads, i1 sign-extending
1073 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1074 for (MVT VT : MVT::fp_valuetypes()) {
1075 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1076 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1077 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1078 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1079 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1080 }
1081 for (MVT VT : MVT::integer_valuetypes())
1082 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1083
1084 for (MVT WideVT : MVT::fp_valuetypes()) {
1085 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1086 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1087 setTruncStoreAction(WideVT, NarrowVT, Expand);
1088 }
1089 }
1090 }
1091
1092 if (Subtarget->hasFPARMv8()) {
1096 }
1097
1098 // Indexed loads and stores are supported.
1099 for (unsigned im = (unsigned)ISD::PRE_INC;
1101 setIndexedLoadAction(im, MVT::i8, Legal);
1102 setIndexedLoadAction(im, MVT::i16, Legal);
1103 setIndexedLoadAction(im, MVT::i32, Legal);
1104 setIndexedLoadAction(im, MVT::i64, Legal);
1105 setIndexedLoadAction(im, MVT::f64, Legal);
1106 setIndexedLoadAction(im, MVT::f32, Legal);
1107 setIndexedLoadAction(im, MVT::f16, Legal);
1108 setIndexedLoadAction(im, MVT::bf16, Legal);
1109 setIndexedStoreAction(im, MVT::i8, Legal);
1110 setIndexedStoreAction(im, MVT::i16, Legal);
1111 setIndexedStoreAction(im, MVT::i32, Legal);
1112 setIndexedStoreAction(im, MVT::i64, Legal);
1113 setIndexedStoreAction(im, MVT::f64, Legal);
1114 setIndexedStoreAction(im, MVT::f32, Legal);
1115 setIndexedStoreAction(im, MVT::f16, Legal);
1116 setIndexedStoreAction(im, MVT::bf16, Legal);
1117 }
1118
1119 // Trap.
1120 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1123
1124 // We combine OR nodes for ccmp operations.
1126 // Try to create BICs for vector ANDs.
1128
1129 // llvm.init.trampoline and llvm.adjust.trampoline
1132
1133 // Vector add and sub nodes may conceal a high-half opportunity.
1134 // Also, try to fold ADD into CSINC/CSINV.
1137
1140
1141 // Try and combine setcc with csel
1143
1145
1153
1155
1157
1159
1163
1166
1168
1170
1172
1174
1178
1180
1182
1183 // In case of strict alignment, avoid an excessive number of byte wide stores.
1186 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1187
1191 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1192
1195 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1196
1199 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1200
1202
1204
1205 EnableExtLdPromotion = true;
1206
1207 // Set required alignment.
1209 // Set preferred alignments.
1210
1211 // Don't align loops on Windows. The SEH unwind info generation needs to
1212 // know the exact length of functions before the alignments have been
1213 // expanded.
1214 if (!Subtarget->isTargetWindows())
1218
1219 // Only change the limit for entries in a jump table if specified by
1220 // the subtarget, but not at the command line.
1221 unsigned MaxJT = STI.getMaximumJumpTableSize();
1222 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1224
1226
1228
1230 if (Subtarget->hasSME())
1232
1233 if (Subtarget->isNeonAvailable()) {
1234 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1235 // silliness like this:
1236 // clang-format off
1237 for (auto Op :
1258 setOperationAction(Op, MVT::v1f64, Expand);
1259 // clang-format on
1260
1261 for (auto Op :
1266 setOperationAction(Op, MVT::v1i64, Expand);
1267
1268 // AArch64 doesn't have direct vector->f32 conversion instructions for
1269 // elements smaller than i32, so promote the input to i32 first.
1270 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1271 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1272
1273 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1274 // nor a direct i32 -> f16 vector conversion. Set it to Custom, so the
1275 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1278 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1280
1281 if (Subtarget->hasFullFP16()) {
1284
1293 } else {
1294 // When AArch64 doesn't have FullFP16 support, promote the input
1295 // to i32 first.
1296 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1297 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1298 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1299 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1300 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1301 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1302 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1303 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1304 }
1305
1306 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1307 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1314 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1319 }
1320
1321 // Custom handling for some quad-vector types to detect MULL.
1322 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1323 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1324 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1325 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1326 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1327 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1328
1329 // Saturates
1330 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1331 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1336 }
1337
1338 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1339 MVT::v4i32}) {
1346 }
1347
1348 // Vector reductions
1349 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1350 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1351 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1356
1358 }
1359 }
1360 if (Subtarget->hasFullFP16())
1362
1363 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1364 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1373 }
1378
1380 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1381 // Likewise, narrowing and extending vector loads/stores aren't handled
1382 // directly.
1385
1386 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1389 } else {
1392 }
1395
1398
1399 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1400 setTruncStoreAction(VT, InnerVT, Expand);
1401 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1403 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1404 }
1405 }
1406
1407 for (auto Op :
1413 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1415 if (Subtarget->hasFullFP16())
1416 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1418 }
1419
1420 // LRINT and LLRINT.
1421 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1422 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1424 if (Subtarget->hasFullFP16())
1425 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1427 }
1428
1429 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1430
1435
1439
1440 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1441 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1442 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1443 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1444 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1445 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1446
1447 // ADDP custom lowering
1448 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1450 // FADDP custom lowering
1451 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1453
1454 if (Subtarget->hasDotProd()) {
1455 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1457
1458 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1459 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1460 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1461
1462 if (Subtarget->hasMatMulInt8()) {
1464 MVT::v16i8, Legal);
1466 MVT::v16i8, Custom);
1467
1469 MVT::v8i8, Legal);
1470 }
1471 }
1472
1473 } else /* !isNeonAvailable */ {
1475 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1477
1478 if (VT.is128BitVector() || VT.is64BitVector()) {
1482 Subtarget->isLittleEndian() ? Legal : Expand);
1483 }
1484 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1485 setTruncStoreAction(VT, InnerVT, Expand);
1486 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1487 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1488 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1489 }
1490 }
1491 }
1492
1493 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1497 }
1498
1499 if (Subtarget->hasSME()) {
1501 }
1502
1503 // FIXME: Move lowering for more nodes here if those are common between
1504 // SVE and SME.
1505 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1506 for (auto VT :
1507 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1512 }
1513 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1516 }
1517
1518 if (Subtarget->hasSVE2p1() ||
1519 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1521
1522 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1524 }
1525
1526 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1527 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1568
1574
1583
1588
1589 if (!Subtarget->isLittleEndian())
1591
1592 if (Subtarget->hasSVE2() ||
1593 (Subtarget->hasSME() && Subtarget->isStreaming()))
1594 // For SLI/SRI.
1596 }
1597
1598 // Illegal unpacked integer vector types.
1599 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1602 }
1603
1604 // Type legalize unpacked bitcasts.
1605 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1607
1608 for (auto VT :
1609 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1610 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1612
1613 for (auto VT :
1614 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1622
1626
1627 // There are no legal MVT::nxv16f## based types.
1628 if (VT != MVT::nxv16i1) {
1633 }
1634 }
1635
1636 // NEON doesn't support masked loads/stores, but SME and SVE do.
1637 for (auto VT :
1638 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1639 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1640 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1643 }
1644
1645 // Firstly, exclude all scalable vector extending loads/truncating stores,
1646 // including both integer and floating-point scalable vectors.
1648 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1649 setTruncStoreAction(VT, InnerVT, Expand);
1650 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1651 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1653 }
1654 }
1655
1656 // Then, selectively enable those which we directly support.
1657 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1658 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1660 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1661 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1662 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1663 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1664 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1665 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1667 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1668 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1669 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1670 }
1671
1672 // SVE supports truncating stores of 64 and 128-bit vectors
1673 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1674 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1676 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1677 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1678
1679 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1680 MVT::nxv4f32, MVT::nxv2f64}) {
1720
1742
1754 }
1755
1756 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1772
1773 if (Subtarget->hasSVEB16B16() &&
1774 Subtarget->isNonStreamingSVEorSME2Available()) {
1783 }
1784 }
1785
1786 for (auto Opcode :
1791 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1792 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1793 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1794 }
1795
1796 if (!Subtarget->hasSVEB16B16() ||
1797 !Subtarget->isNonStreamingSVEorSME2Available()) {
1798 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1800 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1801 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1802 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1803 }
1804 }
1805
1808
1809 // NEON doesn't support integer divides, but SVE does
1810 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1811 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1814 }
1815
1816 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1817 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1818 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1819
1820 // NOTE: Currently this has to happen after computeRegisterProperties rather
1821 // than the preferred option of combining it with the addRegisterClass call.
1822 if (Subtarget->useSVEForFixedLengthVectors()) {
1825 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1826 addTypeForFixedLengthSVE(VT);
1827 }
1830 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1831 addTypeForFixedLengthSVE(VT);
1832 }
1833
1834 // 64-bit results can mean a bigger-than-NEON input.
1835 for (auto VT : {MVT::v8i8, MVT::v4i16})
1838
1839 // 128-bit results imply a bigger-than-NEON input.
1840 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1842 for (auto VT : {MVT::v8f16, MVT::v4f32})
1844
1845 // These operations are not supported on NEON but SVE can do them.
1847 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1848 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1849 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1850 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1851 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1852 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1853 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1854 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1855 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1856 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1857 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1858 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1859 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1860 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1861 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1866
1867 // Int operations with no NEON support.
1868 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1869 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1877 }
1878
1879 // Use SVE for vectors with more than 2 elements.
1880 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1882 }
1883
1884 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1885 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1888
1890
1891 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1893 }
1894
1895 // Handle partial reduction operations
1896 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1897 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1898 // Other pairs will default to 'Expand'.
1899 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1901 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1902 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1903
1904 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1905
1906 if (Subtarget->hasMatMulInt8()) {
1908 MVT::nxv16i8, Legal);
1910 MVT::nxv16i8, Custom);
1911 }
1912
1913 // Wide add types
1914 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1915 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1916 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1918 }
1919 }
1920
1921 // Handle non-aliasing elements mask
1922 if (Subtarget->hasSVE2() ||
1923 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1924 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1925 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1928 }
1929 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1932 }
1933 }
1934
1935 // Handle operations that are only available in non-streaming SVE mode.
1936 if (Subtarget->isSVEAvailable()) {
1937 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1938 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1939 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1940 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1941 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1942 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1943 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1946 }
1947
1948 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1949 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1950 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1952
1953 // We can lower types that have <vscale x {2|4}> elements to compact.
1954 for (auto VT :
1955 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1956 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1958
1959 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1960 // NEON vectors in the lowest bits of the SVE register.
1961 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1962 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1964
1965 // Histcnt is SVE2 only
1966 if (Subtarget->hasSVE2()) {
1968 Custom);
1970 Custom);
1971
1972 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1974 // Must be lowered to SVE instructions.
1975 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1976 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1977 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1978 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1979 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1980 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1981 }
1982 }
1983
1984 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1985 // Only required for llvm.aarch64.mops.memset.tag
1987 }
1988
1990
1991 if (Subtarget->hasSVE()) {
1996 }
1997
1998 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1999
2000 IsStrictFPEnabled = true;
2002
2003 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2004 // it, but it's just a wrapper around ldexp.
2005 if (Subtarget->isTargetWindows()) {
2007 if (isOperationExpand(Op, MVT::f32))
2008 setOperationAction(Op, MVT::f32, Promote);
2009 }
2010
2011 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2012 // isn't legal.
2014 if (isOperationExpand(Op, MVT::f16))
2015 setOperationAction(Op, MVT::f16, Promote);
2016}
2017
2019 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2020}
2021
2022void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2023 assert(VT.isVector() && "VT should be a vector type");
2024
2025 if (VT.isFloatingPoint()) {
2027 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2028 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2029 }
2030
2031 // Mark vector float intrinsics as expand.
2032 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2050 }
2051
2052 // But we do support custom-lowering for FCOPYSIGN.
2053 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2054 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2055 VT == MVT::v8f16) &&
2056 Subtarget->hasFullFP16()))
2058
2071
2075 for (MVT InnerVT : MVT::all_valuetypes())
2076 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2077
2078 // CNT supports only B element sizes, then use UADDLP to widen.
2079 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2081
2087
2088 for (unsigned Opcode :
2091 setOperationAction(Opcode, VT, Custom);
2092
2093 if (!VT.isFloatingPoint())
2095
2096 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2097 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2098 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2099 setOperationAction(Opcode, VT, Legal);
2100
2101 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2102 // NEON types.
2103 if (VT.isFloatingPoint() &&
2104 VT.getVectorElementType() != MVT::bf16 &&
2105 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2106 for (unsigned Opcode :
2112 setOperationAction(Opcode, VT, Legal);
2113
2114 // Strict fp extend and trunc are legal
2115 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2117 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2119
2120 // FIXME: We could potentially make use of the vector comparison instructions
2121 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2122 // complications:
2123 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2124 // so we would need to expand when the condition code doesn't match the
2125 // kind of comparison.
2126 // * Some kinds of comparison require more than one FCMXY instruction so
2127 // would need to be expanded instead.
2128 // * The lowering of the non-strict versions involves target-specific ISD
2129 // nodes so we would likely need to add strict versions of all of them and
2130 // handle them appropriately.
2133
2134 // When little-endian we can use ordinary d and q register loads/stores for
2135 // vector types, but when big-endian we need to use structure load/store which
2136 // only allow post-index addressing.
2137 if (Subtarget->isLittleEndian()) {
2138 for (unsigned im = (unsigned)ISD::PRE_INC;
2142 }
2143 } else {
2146 }
2147
2148 if (Subtarget->hasD128()) {
2151 }
2152
2153 if (VT.isInteger()) {
2154 // Let common code emit inverted variants of compares we do support.
2160 }
2161}
2162
2164 EVT OpVT) const {
2165 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2166 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2167 ResVT.getVectorElementType() != MVT::i1)
2168 return true;
2169
2170 // Only support illegal types if the result is scalable and min elements > 1.
2171 if (ResVT.getVectorMinNumElements() == 1 ||
2172 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2173 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2174 return true;
2175
2176 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2177 // but anything larger should be expanded.
2178 if (OpVT.getFixedSizeInBits() > 64)
2179 return true;
2180
2181 return false;
2182}
2183
2185 const IntrinsicInst *I) const {
2186 assert(I->getIntrinsicID() ==
2187 Intrinsic::experimental_vector_partial_reduce_add &&
2188 "Unexpected intrinsic!");
2189 return true;
2190}
2191
2193 if (!Subtarget->isSVEorStreamingSVEAvailable())
2194 return true;
2195
2196 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2197 // also support fixed-width predicates.
2198 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2199 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2200 VT != MVT::v4i1 && VT != MVT::v2i1;
2201}
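// For example (illustrative), a cttz.elts query on an nxv4i1 predicate is not
// expanded here and can instead be lowered with the BRKB + CNTP sequence
// mentioned above, whereas unsupported predicate types are expanded.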
2202
2204 unsigned SearchSize) const {
2205 // MATCH is SVE2 and only available in non-streaming mode.
2206 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2207 return true;
2208 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2209 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2210 return SearchSize != 8;
2211 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2212 return SearchSize != 8 && SearchSize != 16;
2213 return true;
2214}
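// For example (illustrative), a vector match on nxv16i8 with a search size of
// 8 or 16 maps directly onto the SVE2 MATCH instruction and is therefore not
// expanded; other element types and search sizes are.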
2215
2216void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2217 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2218
2219 // By default everything must be expanded.
2220 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2222
2223 if (VT.isFloatingPoint()) {
2233 }
2234
2236 VT == MVT::v1f64 ? Expand : Custom;
2237
2238 // Mark integer truncating stores/extending loads as having custom lowering
2239 if (VT.isInteger()) {
2240 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2241 while (InnerVT != VT) {
2242 setTruncStoreAction(VT, InnerVT, Default);
2243 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2244 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2245 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2246 InnerVT = InnerVT.changeVectorElementType(
2247 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2248 }
2249 }
2250
2251 // Mark floating-point truncating stores/extending loads as having custom
2252 // lowering
2253 if (VT.isFloatingPoint()) {
2254 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2255 while (InnerVT != VT) {
2256 setTruncStoreAction(VT, InnerVT, Custom);
2257 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2258 InnerVT = InnerVT.changeVectorElementType(
2260 }
2261 }
2262
2263 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2264 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2265
2266 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2268 unsigned NumElts = VT.getVectorNumElements();
2269 if (VT.getVectorElementType() == MVT::i64) {
2270 setPartialReduceMLAAction(MLAOps, VT,
2271 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2272 setPartialReduceMLAAction(MLAOps, VT,
2273 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2274 setPartialReduceMLAAction(MLAOps, VT,
2275 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2276 } else if (VT.getVectorElementType() == MVT::i32) {
2277 setPartialReduceMLAAction(MLAOps, VT,
2278 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2279 setPartialReduceMLAAction(MLAOps, VT,
2280 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2281 } else if (VT.getVectorElementType() == MVT::i16) {
2282 setPartialReduceMLAAction(MLAOps, VT,
2283 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2284 }
2285 if (Subtarget->hasMatMulInt8()) {
2286 if (VT.getVectorElementType() == MVT::i32)
2288 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2289 else if (VT.getVectorElementType() == MVT::i64)
2291 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2292 }
2293
2294 // Lower fixed length vector operations to scalable equivalents.
2301 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2339 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2340 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2342 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2361 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2387}
2388
2389void AArch64TargetLowering::addDRType(MVT VT) {
2390 addRegisterClass(VT, &AArch64::FPR64RegClass);
2391 if (Subtarget->isNeonAvailable())
2392 addTypeForNEON(VT);
2393}
2394
2395void AArch64TargetLowering::addQRType(MVT VT) {
2396 addRegisterClass(VT, &AArch64::FPR128RegClass);
2397 if (Subtarget->isNeonAvailable())
2398 addTypeForNEON(VT);
2399}
2400
2402 LLVMContext &C, EVT VT) const {
2403 if (!VT.isVector())
2404 return MVT::i32;
2405 if (VT.isScalableVector())
2406 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2408}
2409
2410// isIntImmediate - This method tests to see if the node is a constant
2411// operand. If so, Imm will receive the value.
2412static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2413 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2414 Imm = C->getZExtValue();
2415 return true;
2416 }
2417 return false;
2418}
2419
2420bool isVectorizedBinOp(unsigned Opcode) {
2421 switch (Opcode) {
2422 case AArch64ISD::SQDMULH:
2423 return true;
2424 default:
2425 return false;
2426 }
2427}
2428
2429// isOpcWithIntImmediate - This method tests to see if the node is a specific
2430// opcode and that it has an immediate integer right operand.
2431// If so, Imm will receive the value.
2432static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2433 uint64_t &Imm) {
2434 return N->getOpcode() == Opc &&
2435 isIntImmediate(N->getOperand(1).getNode(), Imm);
2436}
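// For example (illustrative), isOpcWithIntImmediate(N, ISD::AND, Imm) matches
// a node of the form (and x, C) with C a constant, and stores C in Imm.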
2437
2438static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2439 const APInt &Demanded,
2441 unsigned NewOpc) {
2442 uint64_t OldImm = Imm, NewImm, Enc;
2443 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2444
2445 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2446 // bimm64.
2447 if (Imm == 0 || Imm == Mask ||
2449 return false;
2450
2451 unsigned EltSize = Size;
2452 uint64_t DemandedBits = Demanded.getZExtValue();
2453
2454 // Clear bits that are not demanded.
2455 Imm &= DemandedBits;
2456
2457 while (true) {
2458 // The goal here is to set the non-demanded bits in a way that minimizes
2459 // the number of transitions between 0 and 1. In order to achieve this goal,
2460 // we set the non-demanded bits to the value of the preceding demanded bits.
2461 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2462 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2463 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2464 // The final result is 0b11000011.
2465 uint64_t NonDemandedBits = ~DemandedBits;
2466 uint64_t InvertedImm = ~Imm & DemandedBits;
2467 uint64_t RotatedImm =
2468 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2469 NonDemandedBits;
2470 uint64_t Sum = RotatedImm + NonDemandedBits;
2471 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2472 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2473 NewImm = (Imm | Ones) & Mask;
2474
2475 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2476 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2477 // we halve the element size and continue the search.
2478 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2479 break;
2480
2481 // We cannot shrink the element size any further if it is 2-bits.
2482 if (EltSize == 2)
2483 return false;
2484
2485 EltSize /= 2;
2486 Mask >>= EltSize;
2487 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2488
2489 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2490 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2491 return false;
2492
2493 // Merge the upper and lower halves of Imm and DemandedBits.
2494 Imm |= Hi;
2495 DemandedBits |= DemandedBitsHi;
2496 }
2497
2498 ++NumOptimizedImms;
2499
2500 // Replicate the element across the register width.
2501 while (EltSize < Size) {
2502 NewImm |= NewImm << EltSize;
2503 EltSize *= 2;
2504 }
2505
2506 (void)OldImm;
2507 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2508 "demanded bits should never be altered");
2509 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2510
2511 // Create the new constant immediate node.
2512 EVT VT = Op.getValueType();
2513 SDLoc DL(Op);
2514 SDValue New;
2515
2516 // If the new constant immediate is all-zeros or all-ones, let the target
2517 // independent DAG combine optimize this node.
2518 if (NewImm == 0 || NewImm == OrigMask) {
2519 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2520 TLO.DAG.getConstant(NewImm, DL, VT));
2521 // Otherwise, create a machine node so that target independent DAG combine
2522 // doesn't undo this optimization.
2523 } else {
2525 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2526 New = SDValue(
2527 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2528 }
2529
2530 return TLO.CombineTo(Op, New);
2531}
2532
2534 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2535 TargetLoweringOpt &TLO) const {
2536 // Delay this optimization to as late as possible.
2537 if (!TLO.LegalOps)
2538 return false;
2539
2541 return false;
2542
2543 EVT VT = Op.getValueType();
2544 if (VT.isVector())
2545 return false;
2546
2547 unsigned Size = VT.getSizeInBits();
2548
2549 if (Size != 32 && Size != 64)
2550 return false;
2551
2552 // Exit early if we demand all bits.
2553 if (DemandedBits.popcount() == Size)
2554 return false;
2555
2556 unsigned NewOpc;
2557 switch (Op.getOpcode()) {
2558 default:
2559 return false;
2560 case ISD::AND:
2561 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2562 break;
2563 case ISD::OR:
2564 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2565 break;
2566 case ISD::XOR:
2567 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2568 break;
2569 }
2570 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2571 if (!C)
2572 return false;
2573 uint64_t Imm = C->getZExtValue();
2574 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2575}
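// Illustrative flow (not from the original source): when SimplifyDemandedBits
// visits an AND/OR/XOR node whose constant operand is not already a valid
// logical immediate, targetShrinkDemandedConstant above calls
// optimizeLogicalImm to pick values for the non-demanded bits so the constant
// becomes encodable directly in the corresponding ANDXri/ORRXri/EORXri form.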
2576
2577/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2578/// Mask are known to be either zero or one and return them in Known.
2580 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2581 const SelectionDAG &DAG, unsigned Depth) const {
2582 switch (Op.getOpcode()) {
2583 default:
2584 break;
2585 case AArch64ISD::DUP: {
2586 SDValue SrcOp = Op.getOperand(0);
2587 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2588 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2589 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2590 "Expected DUP implicit truncation");
2591 Known = Known.trunc(Op.getScalarValueSizeInBits());
2592 }
2593 break;
2594 }
2595 case AArch64ISD::CSEL: {
2596 KnownBits Known2;
2597 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2598 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2599 Known = Known.intersectWith(Known2);
2600 break;
2601 }
2602 case AArch64ISD::CSNEG:
2603 case AArch64ISD::CSINC:
2604 case AArch64ISD::CSINV: {
2605 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2606 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2607
2608 // The result is either:
2609 // CSINC: KnownOp0 or KnownOp1 + 1
2610 // CSINV: KnownOp0 or ~KnownOp1
2611 // CSNEG: KnownOp0 or KnownOp1 * -1
2612 if (Op.getOpcode() == AArch64ISD::CSINC)
2613 KnownOp1 = KnownBits::add(
2614 KnownOp1,
2615 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2616 else if (Op.getOpcode() == AArch64ISD::CSINV)
2617 std::swap(KnownOp1.Zero, KnownOp1.One);
2618 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2619 KnownOp1 =
2620 KnownBits::mul(KnownOp1, KnownBits::makeConstant(APInt::getAllOnes(
2621 Op.getScalarValueSizeInBits())));
2622
2623 Known = KnownOp0.intersectWith(KnownOp1);
2624 break;
2625 }
2626 case AArch64ISD::BICi: {
2627 // Compute the bit cleared value.
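 // For example (illustrative values, not from the original source): an
 // immediate of 0xFF shifted left by 8 clears bits 15:8, i.e. Mask == ~0xFF00.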
2628 APInt Mask =
2629 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2630 .trunc(Known.getBitWidth());
2631 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2632 Known &= KnownBits::makeConstant(Mask);
2633 break;
2634 }
2635 case AArch64ISD::VLSHR: {
2636 KnownBits Known2;
2637 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2638 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2639 Known = KnownBits::lshr(Known, Known2);
2640 break;
2641 }
2642 case AArch64ISD::VASHR: {
2643 KnownBits Known2;
2644 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2645 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2646 Known = KnownBits::ashr(Known, Known2);
2647 break;
2648 }
2649 case AArch64ISD::VSHL: {
2650 KnownBits Known2;
2651 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2652 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2653 Known = KnownBits::shl(Known, Known2);
2654 break;
2655 }
2656 case AArch64ISD::MOVI: {
2657 Known = KnownBits::makeConstant(
2658 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2659 break;
2660 }
2661 case AArch64ISD::MOVIshift: {
2662 Known = KnownBits::makeConstant(
2663 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2664 << Op->getConstantOperandVal(1)));
2665 break;
2666 }
2667 case AArch64ISD::MOVImsl: {
2668 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2669 Known = KnownBits::makeConstant(APInt(
2670 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2671 break;
2672 }
2673 case AArch64ISD::MOVIedit: {
2674 Known = KnownBits::makeConstant(APInt(
2675 Known.getBitWidth(),
2676 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2677 break;
2678 }
2679 case AArch64ISD::MVNIshift: {
2680 Known = KnownBits::makeConstant(
2681 APInt(Known.getBitWidth(),
2682 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2683 /*isSigned*/ false, /*implicitTrunc*/ true));
2684 break;
2685 }
2686 case AArch64ISD::MVNImsl: {
2687 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2688 Known = KnownBits::makeConstant(
2689 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2690 /*isSigned*/ false, /*implicitTrunc*/ true));
2691 break;
2692 }
2693 case AArch64ISD::LOADgot:
2694 case AArch64ISD::ADDlow: {
2695 if (!Subtarget->isTargetILP32())
2696 break;
2697 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2698 Known.Zero = APInt::getHighBitsSet(64, 32);
2699 break;
2700 }
2701 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2702 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
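 // The wrapped value is asserted to be a boolean (0 or 1), so at least bits
 // 7:1 can additionally be marked as known zero.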
2703 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2704 break;
2705 }
2706 case ISD::INTRINSIC_W_CHAIN: {
2707 Intrinsic::ID IntID =
2708 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2709 switch (IntID) {
2710 default: return;
2711 case Intrinsic::aarch64_ldaxr:
2712 case Intrinsic::aarch64_ldxr: {
2713 unsigned BitWidth = Known.getBitWidth();
2714 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2715 unsigned MemBits = VT.getScalarSizeInBits();
2716 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2717 return;
2718 }
2719 }
2720 break;
2721 }
2722 case ISD::INTRINSIC_WO_CHAIN:
2723 case ISD::INTRINSIC_VOID: {
2724 unsigned IntNo = Op.getConstantOperandVal(0);
2725 switch (IntNo) {
2726 default:
2727 break;
2728 case Intrinsic::aarch64_neon_uaddlv: {
2729 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2730 unsigned BitWidth = Known.getBitWidth();
2731 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2732 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
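 // Summing 8 (or 16) bytes of at most 255 each needs at most 11 (or 12) bits:
 // 8 * 255 = 2040 < 2^11 and 16 * 255 = 4080 < 2^12.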
2733 assert(BitWidth >= Bound && "Unexpected width!");
2734 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
2735 Known.Zero |= Mask;
2736 }
2737 break;
2738 }
2739 case Intrinsic::aarch64_neon_umaxv:
2740 case Intrinsic::aarch64_neon_uminv: {
2741 // Figure out the datatype of the vector operand. The UMINV instruction
2742 // will zero extend the result, so we can mark as known zero all the
2743 // bits larger than the element datatype. 32-bit or larger doesn't need
2744 // this as those are legal types and will be handled by isel directly.
2745 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2746 unsigned BitWidth = Known.getBitWidth();
2747 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2748 assert(BitWidth >= 8 && "Unexpected width!");
2749 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2750 Known.Zero |= Mask;
2751 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2752 assert(BitWidth >= 16 && "Unexpected width!");
2753 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2754 Known.Zero |= Mask;
2755 }
2756 break;
2757 } break;
2758 }
2759 }
2760 }
2761}
2762
2763unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2764 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2765 unsigned Depth) const {
2766 EVT VT = Op.getValueType();
2767 unsigned VTBits = VT.getScalarSizeInBits();
2768 unsigned Opcode = Op.getOpcode();
2769 switch (Opcode) {
2770 case AArch64ISD::FCMEQ:
2771 case AArch64ISD::FCMGE:
2772 case AArch64ISD::FCMGT:
2773 // Compares return either 0 or all-ones
2774 return VTBits;
2775 case AArch64ISD::VASHR: {
2776 unsigned Tmp =
2777 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2778 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2779 }
2780 }
2781
2782 return 1;
2783}
2784
2785MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2786 EVT) const {
2787 return MVT::i64;
2788}
2789
2790bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2791 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2792 unsigned *Fast) const {
2793
2794 // Allow SVE loads/stores where the alignment >= the size of the element type,
2795 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2796 // for stores that come from IR, only require element-size alignment (even if
2797 // unaligned accesses are disabled). Without this, these will be forced to
2798 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2799 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2800 if (VT.isScalableVector()) {
2801 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2802 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2803 return true;
2804 }
2805
2806 if (Subtarget->requiresStrictAlign())
2807 return false;
2808
2809 if (Fast) {
2810 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2811 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2812 // See comments in performSTORECombine() for more details about
2813 // these conditions.
2814
2815 // Code that uses clang vector extensions can mark that it
2816 // wants unaligned accesses to be treated as fast by
2817 // underspecifying alignment to be 1 or 2.
2818 Alignment <= 2 ||
2819
2820 // Disregard v2i64. Memcpy lowering produces those and splitting
2821 // them regresses performance on micro-benchmarks and olden/bh.
2822 VT == MVT::v2i64;
2823 }
2824 return true;
2825}
2826
2827// Same as above but handling LLTs instead.
2828bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2829 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2830 unsigned *Fast) const {
2831 if (Subtarget->requiresStrictAlign())
2832 return false;
2833
2834 if (Fast) {
2835 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2836 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2837 Ty.getSizeInBytes() != 16 ||
2838 // See comments in performSTORECombine() for more details about
2839 // these conditions.
2840
2841 // Code that uses clang vector extensions can mark that it
2842 // wants unaligned accesses to be treated as fast by
2843 // underspecifying alignment to be 1 or 2.
2844 Alignment <= 2 ||
2845
2846 // Disregard v2i64. Memcpy lowering produces those and splitting
2847 // them regresses performance on micro-benchmarks and olden/bh.
2848 Ty == LLT::fixed_vector(2, 64);
2849 }
2850 return true;
2851}
2852
2853FastISel *
2854AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2855 const TargetLibraryInfo *libInfo) const {
2856 return AArch64::createFastISel(funcInfo, libInfo);
2857}
2858
2859MachineBasicBlock *
2860AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2861 MachineBasicBlock *MBB) const {
2862 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2863 // phi node:
2864
2865 // OrigBB:
2866 // [... previous instrs leading to comparison ...]
2867 // b.ne TrueBB
2868 // b EndBB
2869 // TrueBB:
2870 // ; Fallthrough
2871 // EndBB:
2872 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2873
2874 MachineFunction *MF = MBB->getParent();
2875 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2876 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2877 DebugLoc DL = MI.getDebugLoc();
2878 MachineFunction::iterator It = ++MBB->getIterator();
2879
2880 Register DestReg = MI.getOperand(0).getReg();
2881 Register IfTrueReg = MI.getOperand(1).getReg();
2882 Register IfFalseReg = MI.getOperand(2).getReg();
2883 unsigned CondCode = MI.getOperand(3).getImm();
2884 bool NZCVKilled = MI.getOperand(4).isKill();
2885
2886 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2887 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2888 MF->insert(It, TrueBB);
2889 MF->insert(It, EndBB);
2890
2891 // Transfer rest of current basic-block to EndBB
2892 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2893 MBB->end());
2895
2896 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2897 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2898 MBB->addSuccessor(TrueBB);
2899 MBB->addSuccessor(EndBB);
2900
2901 // TrueBB falls through to the end.
2902 TrueBB->addSuccessor(EndBB);
2903
2904 if (!NZCVKilled) {
2905 TrueBB->addLiveIn(AArch64::NZCV);
2906 EndBB->addLiveIn(AArch64::NZCV);
2907 }
2908
2909 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2910 .addReg(IfTrueReg)
2911 .addMBB(TrueBB)
2912 .addReg(IfFalseReg)
2913 .addMBB(MBB);
2914
2915 MI.eraseFromParent();
2916 return EndBB;
2917}
2918
2920 MachineInstr &MI, MachineBasicBlock *BB) const {
2922 BB->getParent()->getFunction().getPersonalityFn())) &&
2923 "SEH does not use catchret!");
2924 return BB;
2925}
2926
2929 MachineBasicBlock *MBB) const {
2930 MachineFunction &MF = *MBB->getParent();
2931 MachineBasicBlock::iterator MBBI = MI.getIterator();
2932 const AArch64InstrInfo &TII =
2933 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2934 Register TargetReg = MI.getOperand(0).getReg();
2936 TII.probedStackAlloc(MBBI, TargetReg, false);
2937
2938 MI.eraseFromParent();
2939 return NextInst->getParent();
2940}
2941
2943AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2945 MachineBasicBlock *BB) const {
2946 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2947 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2948
2949 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2950 MIB.add(MI.getOperand(1)); // slice index register
2951 MIB.add(MI.getOperand(2)); // slice index offset
2952 MIB.add(MI.getOperand(3)); // pg
2953 MIB.add(MI.getOperand(4)); // base
2954 MIB.add(MI.getOperand(5)); // offset
2955
2956 MI.eraseFromParent(); // The pseudo is gone now.
2957 return BB;
2958}
2959
2962 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2964 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2965
2966 MIB.addReg(AArch64::ZA, RegState::Define);
2967 MIB.add(MI.getOperand(0)); // Vector select register
2968 MIB.add(MI.getOperand(1)); // Vector select offset
2969 MIB.add(MI.getOperand(2)); // Base
2970 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2971
2972 MI.eraseFromParent(); // The pseudo is gone now.
2973 return BB;
2974}
2975
2978 unsigned Opcode,
2979 bool Op0IsDef) const {
2980 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2982
2983 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2984 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2985 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2986 MIB.add(MI.getOperand(I));
2987
2988 MI.eraseFromParent(); // The pseudo is gone now.
2989 return BB;
2990}
2991
2993AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2995 MachineBasicBlock *BB) const {
2996 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2997 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2998 unsigned StartIdx = 0;
2999
3000 bool HasTile = BaseReg != AArch64::ZA;
3001 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3002 if (HasZPROut) {
3003 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3004 ++StartIdx;
3005 }
3006 if (HasTile) {
3007 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3008 RegState::Define); // Output ZA Tile
3009 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3010 StartIdx++;
3011 } else {
3012 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3013 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3014 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3015 ++StartIdx;
3016 }
3017 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3018 }
3019 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3020 MIB.add(MI.getOperand(I));
3021
3022 MI.eraseFromParent(); // The pseudo is gone now.
3023 return BB;
3024}
3025
3028 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3030 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3031 MIB.add(MI.getOperand(0)); // Mask
3032
3033 unsigned Mask = MI.getOperand(0).getImm();
3034 for (unsigned I = 0; I < 8; I++) {
3035 if (Mask & (1 << I))
3036 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3037 }
3038
3039 MI.eraseFromParent(); // The pseudo is gone now.
3040 return BB;
3041}
3042
3045 MachineBasicBlock *BB) const {
3046 MachineFunction *MF = BB->getParent();
3047 MachineFrameInfo &MFI = MF->getFrameInfo();
3049 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3050 if (TPIDR2.Uses > 0) {
3051 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3052 // generally don't support big-endian SVE/SME.
3053 if (!Subtarget->isLittleEndian())
3055 "TPIDR2 block initialization is not supported on big-endian targets");
3056
3057 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3058 // Store buffer pointer and num_za_save_slices.
3059 // Bytes 10-15 are implicitly zeroed.
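 // The STPXi below stores the two X registers as a 16-byte pair at the frame
 // index, so the buffer pointer occupies bytes 0-7 and the save-slice count
 // starts at byte 8 of the TPIDR2 block.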
3060 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3061 .addReg(MI.getOperand(0).getReg())
3062 .addReg(MI.getOperand(1).getReg())
3063 .addFrameIndex(TPIDR2.FrameIndex)
3064 .addImm(0);
3065 } else
3066 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3067
3068 BB->remove_instr(&MI);
3069 return BB;
3070}
3071
3074 MachineBasicBlock *BB) const {
3075 MachineFunction *MF = BB->getParent();
3076 MachineFrameInfo &MFI = MF->getFrameInfo();
3078 // TODO This function grows the stack with a subtraction, which doesn't work
3079 // on Windows. Some refactoring to share the functionality in
3080 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3081 // supports SME
3083 "Lazy ZA save is not yet supported on Windows");
3084
3085 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3086
3087 if (TPIDR2.Uses > 0) {
3088 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3090
3091 // The MSUBXrrr below cannot take SP as an operand directly, so copy SP
3092 // into a virtual register first.
3093 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3094 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3095 .addReg(AArch64::SP);
3096
3097 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3098 auto Size = MI.getOperand(1).getReg();
3099 auto Dest = MI.getOperand(0).getReg();
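 // MSUBXrrr computes Dest = SP - Size * Size, i.e. it moves the stack pointer
 // down by the SVL * SVL byte lazy-save buffer.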
3100 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3101 .addReg(Size)
3102 .addReg(Size)
3103 .addReg(SP);
3104 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3105 AArch64::SP)
3106 .addReg(Dest);
3107
3108 // We have just allocated a variable sized object, tell this to PEI.
3109 MFI.CreateVariableSizedObject(Align(16), nullptr);
3110 }
3111
3112 BB->remove_instr(&MI);
3113 return BB;
3114}
3115
3116// TODO: Find a way to merge this with EmitAllocateZABuffer.
3119 MachineBasicBlock *BB) const {
3120 MachineFunction *MF = BB->getParent();
3121 MachineFrameInfo &MFI = MF->getFrameInfo();
3124 "Lazy ZA save is not yet supported on Windows");
3125
3126 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3127 if (FuncInfo->isSMESaveBufferUsed()) {
3128 // Allocate a buffer object of the size given by MI.getOperand(1).
3129 auto Size = MI.getOperand(1).getReg();
3130 auto Dest = MI.getOperand(0).getReg();
3131 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3132 .addReg(AArch64::SP)
3133 .addReg(Size)
3135 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3136 .addReg(AArch64::SP);
3137
3138 // We have just allocated a variable sized object, tell this to PEI.
3139 MFI.CreateVariableSizedObject(Align(16), nullptr);
3140 } else
3141 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3142 MI.getOperand(0).getReg());
3143
3144 BB->remove_instr(&MI);
3145 return BB;
3146}
3147
3150 MachineBasicBlock *BB) const {
3151 // If the buffer is used, emit a call to __arm_sme_state_size()
3152 MachineFunction *MF = BB->getParent();
3154 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3155 if (FuncInfo->isSMESaveBufferUsed()) {
3156 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3157 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3158 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3160 .addReg(AArch64::X0, RegState::ImplicitDefine)
3161 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3162 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3163 MI.getOperand(0).getReg())
3164 .addReg(AArch64::X0);
3165 } else
3166 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3167 MI.getOperand(0).getReg())
3168 .addReg(AArch64::XZR);
3169 BB->remove_instr(&MI);
3170 return BB;
3171}
3172
3175 MachineBasicBlock *BB) const {
3176 MachineFunction *MF = BB->getParent();
3178 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3179 Register ResultReg = MI.getOperand(0).getReg();
3180 if (FuncInfo->isPStateSMRegUsed()) {
3181 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3182 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3183 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3185 .addReg(AArch64::X0, RegState::ImplicitDefine)
3186 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3187 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
3188 .addReg(AArch64::X0);
3189 } else {
3190 assert(MI.getMF()->getRegInfo().use_empty(ResultReg) &&
3191 "Expected no users of the entry pstate.sm!");
3192 }
3193 MI.eraseFromParent();
3194 return BB;
3195}
3196
3197// Helper function to find the instruction that defined a virtual register.
3198// If unable to find such instruction, returns nullptr.
3200 Register Reg) {
3201 while (Reg.isVirtual()) {
3202 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3203 assert(DefMI && "Virtual register definition not found");
3204 unsigned Opcode = DefMI->getOpcode();
3205
3206 if (Opcode == AArch64::COPY) {
3207 Reg = DefMI->getOperand(1).getReg();
3208 // Vreg is defined by copying from physreg.
3209 if (Reg.isPhysical())
3210 return DefMI;
3211 continue;
3212 }
3213 if (Opcode == AArch64::SUBREG_TO_REG) {
3214 Reg = DefMI->getOperand(2).getReg();
3215 continue;
3216 }
3217
3218 return DefMI;
3219 }
3220 return nullptr;
3221}
3222
3225 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3226 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3227 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3228 const DebugLoc &DL = MI.getDebugLoc();
3229
3230 Register AddrDisc = AddrDiscOp.getReg();
3231 int64_t IntDisc = IntDiscOp.getImm();
3232 assert(IntDisc == 0 && "Blend components are already expanded");
3233
3234 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3235 if (DiscMI) {
3236 switch (DiscMI->getOpcode()) {
3237 case AArch64::MOVKXi:
3238 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3239 // #imm should be an immediate and not a global symbol, for example.
3240 if (DiscMI->getOperand(2).isImm() &&
3241 DiscMI->getOperand(3).getImm() == 48) {
3242 AddrDisc = DiscMI->getOperand(1).getReg();
3243 IntDisc = DiscMI->getOperand(2).getImm();
3244 }
3245 break;
3246 case AArch64::MOVi32imm:
3247 case AArch64::MOVi64imm:
3248 // Small immediate integer constant passed via VReg.
3249 if (DiscMI->getOperand(1).isImm() &&
3250 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3251 AddrDisc = AArch64::NoRegister;
3252 IntDisc = DiscMI->getOperand(1).getImm();
3253 }
3254 break;
3255 }
3256 }
3257
3258 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3259 // in the requested register class.
3260 if (AddrDisc == AArch64::XZR)
3261 AddrDisc = AArch64::NoRegister;
3262
3263 // Make sure AddrDisc operand respects the register class imposed by MI.
3264 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3265 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3266 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3267 AddrDisc = TmpReg;
3268 }
3269
3270 AddrDiscOp.setReg(AddrDisc);
3271 IntDiscOp.setImm(IntDisc);
3272}
3273
3275 MachineInstr &MI, MachineBasicBlock *BB) const {
3276
3277 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3278 if (SMEOrigInstr != -1) {
3279 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3280 uint64_t SMEMatrixType =
3281 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3282 switch (SMEMatrixType) {
3284 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3286 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3288 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3290 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3292 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3294 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3295 }
3296 }
3297
3298 switch (MI.getOpcode()) {
3299 default:
3300#ifndef NDEBUG
3301 MI.dump();
3302#endif
3303 llvm_unreachable("Unexpected instruction for custom inserter!");
3304 case AArch64::InitTPIDR2Obj:
3305 return EmitInitTPIDR2Object(MI, BB);
3306 case AArch64::AllocateZABuffer:
3307 return EmitAllocateZABuffer(MI, BB);
3308 case AArch64::AllocateSMESaveBuffer:
3309 return EmitAllocateSMESaveBuffer(MI, BB);
3310 case AArch64::GetSMESaveSize:
3311 return EmitGetSMESaveSize(MI, BB);
3312 case AArch64::EntryPStateSM:
3313 return EmitEntryPStateSM(MI, BB);
3314 case AArch64::F128CSEL:
3315 return EmitF128CSEL(MI, BB);
3316 case TargetOpcode::STATEPOINT:
3317 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
3318 // while the BL call instruction (which the statepoint is lowered to at the
3319 // end) has an implicit def. This def is early-clobber as it will be set at
3320 // the moment of the call, before any use is read.
3321 // Add this implicit dead def here as a workaround.
3322 MI.addOperand(*MI.getMF(),
3324 AArch64::LR, /*isDef*/ true,
3325 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3326 /*isUndef*/ false, /*isEarlyClobber*/ true));
3327 [[fallthrough]];
3328 case TargetOpcode::STACKMAP:
3329 case TargetOpcode::PATCHPOINT:
3330 return emitPatchPoint(MI, BB);
3331
3332 case TargetOpcode::PATCHABLE_EVENT_CALL:
3333 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3334 return BB;
3335
3336 case AArch64::CATCHRET:
3337 return EmitLoweredCatchRet(MI, BB);
3338
3339 case AArch64::PROBED_STACKALLOC_DYN:
3340 return EmitDynamicProbedAlloc(MI, BB);
3341
3342 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3343 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3344 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3345 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3346 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3347 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3348 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3349 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3350 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3351 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3352 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3353 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3354 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3355 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3356 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3357 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3358 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3359 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3360 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3361 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3362 case AArch64::LDR_ZA_PSEUDO:
3363 return EmitFill(MI, BB);
3364 case AArch64::LDR_TX_PSEUDO:
3365 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3366 case AArch64::STR_TX_PSEUDO:
3367 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3368 case AArch64::ZERO_M_PSEUDO:
3369 return EmitZero(MI, BB);
3370 case AArch64::ZERO_T_PSEUDO:
3371 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3372 case AArch64::MOVT_TIZ_PSEUDO:
3373 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3374
3375 case AArch64::PAC:
3376 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3377 &AArch64::GPR64noipRegClass);
3378 return BB;
3379 }
3380}
3381
3382//===----------------------------------------------------------------------===//
3383// AArch64 Lowering private implementation.
3384//===----------------------------------------------------------------------===//
3385
3386//===----------------------------------------------------------------------===//
3387// Lowering Code
3388//===----------------------------------------------------------------------===//
3389
3390// Forward declarations of SVE fixed length lowering helpers
3395 SelectionDAG &DAG);
3398 EVT VT);
3399
3400/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3401static bool isZerosVector(const SDNode *N) {
3402 // Look through a bit convert.
3403 while (N->getOpcode() == ISD::BITCAST)
3404 N = N->getOperand(0).getNode();
3405
3407 return true;
3408
3409 if (N->getOpcode() != AArch64ISD::DUP)
3410 return false;
3411
3412 auto Opnd0 = N->getOperand(0);
3413 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3414}
3415
3416/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3417/// CC
3419 SDValue RHS = {}) {
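 // For comparisons against zero, "x >= 0" and "x < 0" depend only on the sign
 // bit of x, so the N-flag-only conditions PL and MI can be used in place of
 // GE and LT.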
3420 switch (CC) {
3421 default:
3422 llvm_unreachable("Unknown condition code!");
3423 case ISD::SETNE:
3424 return AArch64CC::NE;
3425 case ISD::SETEQ:
3426 return AArch64CC::EQ;
3427 case ISD::SETGT:
3428 return AArch64CC::GT;
3429 case ISD::SETGE:
3430 return (RHS && isNullConstant(RHS)) ? AArch64CC::PL : AArch64CC::GE;
3431 case ISD::SETLT:
3432 return (RHS && isNullConstant(RHS)) ? AArch64CC::MI : AArch64CC::LT;
3433 case ISD::SETLE:
3434 return AArch64CC::LE;
3435 case ISD::SETUGT:
3436 return AArch64CC::HI;
3437 case ISD::SETUGE:
3438 return AArch64CC::HS;
3439 case ISD::SETULT:
3440 return AArch64CC::LO;
3441 case ISD::SETULE:
3442 return AArch64CC::LS;
3443 }
3444}
3445
3446/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3448 AArch64CC::CondCode &CondCode,
3449 AArch64CC::CondCode &CondCode2) {
3450 CondCode2 = AArch64CC::AL;
3451 switch (CC) {
3452 default:
3453 llvm_unreachable("Unknown FP condition!");
3454 case ISD::SETEQ:
3455 case ISD::SETOEQ:
3456 CondCode = AArch64CC::EQ;
3457 break;
3458 case ISD::SETGT:
3459 case ISD::SETOGT:
3460 CondCode = AArch64CC::GT;
3461 break;
3462 case ISD::SETGE:
3463 case ISD::SETOGE:
3464 CondCode = AArch64CC::GE;
3465 break;
3466 case ISD::SETOLT:
3467 CondCode = AArch64CC::MI;
3468 break;
3469 case ISD::SETOLE:
3470 CondCode = AArch64CC::LS;
3471 break;
3472 case ISD::SETONE:
3473 CondCode = AArch64CC::MI;
3474 CondCode2 = AArch64CC::GT;
3475 break;
3476 case ISD::SETO:
3477 CondCode = AArch64CC::VC;
3478 break;
3479 case ISD::SETUO:
3480 CondCode = AArch64CC::VS;
3481 break;
3482 case ISD::SETUEQ:
3483 CondCode = AArch64CC::EQ;
3484 CondCode2 = AArch64CC::VS;
3485 break;
3486 case ISD::SETUGT:
3487 CondCode = AArch64CC::HI;
3488 break;
3489 case ISD::SETUGE:
3490 CondCode = AArch64CC::PL;
3491 break;
3492 case ISD::SETLT:
3493 case ISD::SETULT:
3494 CondCode = AArch64CC::LT;
3495 break;
3496 case ISD::SETLE:
3497 case ISD::SETULE:
3498 CondCode = AArch64CC::LE;
3499 break;
3500 case ISD::SETNE:
3501 case ISD::SETUNE:
3502 CondCode = AArch64CC::NE;
3503 break;
3504 }
3505}
3506
3507/// Convert a DAG fp condition code to an AArch64 CC.
3508/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3509/// should be AND'ed instead of OR'ed.
3511 AArch64CC::CondCode &CondCode,
3512 AArch64CC::CondCode &CondCode2) {
3513 CondCode2 = AArch64CC::AL;
3514 switch (CC) {
3515 default:
3516 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3517 assert(CondCode2 == AArch64CC::AL);
3518 break;
3519 case ISD::SETONE:
3520 // (a one b)
3521 // == ((a olt b) || (a ogt b))
3522 // == ((a ord b) && (a une b))
3523 CondCode = AArch64CC::VC;
3524 CondCode2 = AArch64CC::NE;
3525 break;
3526 case ISD::SETUEQ:
3527 // (a ueq b)
3528 // == ((a uno b) || (a oeq b))
3529 // == ((a ule b) && (a uge b))
3530 CondCode = AArch64CC::PL;
3531 CondCode2 = AArch64CC::LE;
3532 break;
3533 }
3534}
3535
3536/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3537/// CC usable with the vector instructions. Fewer operations are available
3538/// without a real NZCV register, so we have to use less efficient combinations
3539/// to get the same effect.
3541 AArch64CC::CondCode &CondCode,
3542 AArch64CC::CondCode &CondCode2,
3543 bool &Invert) {
3544 Invert = false;
3545 switch (CC) {
3546 default:
3547 // Mostly the scalar mappings work fine.
3548 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3549 break;
3550 case ISD::SETUO:
3551 Invert = true;
3552 [[fallthrough]];
3553 case ISD::SETO:
3554 CondCode = AArch64CC::MI;
3555 CondCode2 = AArch64CC::GE;
3556 break;
3557 case ISD::SETUEQ:
3558 case ISD::SETULT:
3559 case ISD::SETULE:
3560 case ISD::SETUGT:
3561 case ISD::SETUGE:
3562 // All of the compare-mask comparisons are ordered, but we can switch
3563 // between the two by a double inversion. E.g. ULE == !OGT.
3564 Invert = true;
3565 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3566 CondCode, CondCode2);
3567 break;
3568 }
3569}
3570
3571/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3573 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3574 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3575}
3576
3578 // Matches AArch64DAGToDAGISel::SelectArithImmed().
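 // Arithmetic immediates are 12 bits, optionally shifted left by 12: for
 // example, 0xFFF and 0xABC000 are legal, while 0x1001 is not.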
3579 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3580 LLVM_DEBUG(dbgs() << "Is imm " << C
3581 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3582 return IsLegal;
3583}
3584
3586 // Works for negative immediates too, as it can be written as an ADDS
3587 // instruction with a negated immediate.
3588 return isLegalArithImmed(C.abs().getZExtValue());
3589}
3590
3592 uint64_t Imm = C.getZExtValue();
3594 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3595 return Insn.size();
3596}
3597
3599 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3600 if (Op->getFlags().hasNoSignedWrap())
3601 return true;
3602
3603 // We can still figure out if the second operand is safe to use
3604 // in a CMN instruction by checking if it is known to be not the minimum
3605 // signed value. If it is not, then we can safely use CMN.
3606 // Note: We can eventually remove this check and simply rely on
3607 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3608 // consistently sets them appropriately when making said nodes.
3609
3610 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3611 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3612}
3613
3614// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3615// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3616// can be set differently by this operation. It comes down to whether
3617// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3618// everything is fine. If not then the optimization is wrong. Thus general
3619// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3620//
3621// So, finally, the only LLVM-native comparisons that don't mention C or V
3622// are the ones that aren't unsigned comparisons. They're the only ones we can
3623// safely use CMN for in the absence of information about op2.
3625 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3626 (isIntEqualitySetCC(CC) ||
3627 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3628 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3629}
3630
3632 SelectionDAG &DAG, SDValue Chain,
3633 bool IsSignaling) {
3634 EVT VT = LHS.getValueType();
3635 assert(VT != MVT::f128);
3636
3637 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3638
3639 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3640 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3641 {Chain, LHS});
3642 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3643 {LHS.getValue(1), RHS});
3644 Chain = RHS.getValue(1);
3645 }
3646 unsigned Opcode =
3647 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3648 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3649}
3650
3652 const SDLoc &DL, SelectionDAG &DAG) {
3653 EVT VT = LHS.getValueType();
3654 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3655
3656 if (VT.isFloatingPoint()) {
3657 assert(VT != MVT::f128);
3658 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3659 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3660 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3661 }
3662 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3663 }
3664
3665 // The CMP instruction is just an alias for SUBS, and representing it as
3666 // SUBS means that it's possible to get CSE with subtract operations.
3667 // A later phase can perform the optimization of setting the destination
3668 // register to WZR/XZR if it ends up being unused.
3669 unsigned Opcode = AArch64ISD::SUBS;
3670
3671 if (isCMN(RHS, CC, DAG)) {
3672 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3673 Opcode = AArch64ISD::ADDS;
3674 RHS = RHS.getOperand(1);
3675 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3676 isIntEqualitySetCC(CC)) {
3677 // As we are looking for EQ/NE compares, the operands can be commuted; can
3678 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3679 Opcode = AArch64ISD::ADDS;
3680 LHS = LHS.getOperand(1);
3681 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3682 if (LHS.getOpcode() == ISD::AND) {
3683 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3684 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3685 // of the signed comparisons.
3686 const SDValue ANDSNode =
3687 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3688 LHS.getOperand(0), LHS.getOperand(1));
3689 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3690 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3691 return ANDSNode.getValue(1);
3692 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3693 // Use result of ANDS
3694 return LHS.getValue(1);
3695 }
3696 }
3697
3698 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3699 .getValue(1);
3700}
3701
3702/// \defgroup AArch64CCMP CMP;CCMP matching
3703///
3704/// These functions deal with the formation of CMP;CCMP;... sequences.
3705/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3706/// a comparison. They set the NZCV flags to a predefined value if their
3707/// predicate is false. This allows us to express arbitrary conjunctions, for
3708/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3709/// expressed as:
3710/// cmp A
3711/// ccmp B, inv(CB), CA
3712/// check for CB flags
3713///
3714/// This naturally lets us implement chains of AND operations with SETCC
3715/// operands. And we can even implement some other situations by transforming
3716/// them:
3717/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3718/// negating the flags used in a CCMP/FCCMP operations.
3719/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3720/// by negating the flags we test for afterwards. i.e.
3721/// NEG (CMP CCMP CCCMP ...) can be implemented.
3722/// - Note that we can only ever negate all previously processed results.
3723/// What we can not implement by flipping the flags to test is a negation
3724/// of two sub-trees (because the negation affects all sub-trees emitted so
3725/// far, so the 2nd sub-tree we emit would also affect the first).
3726/// With those tools we can implement some OR operations:
3727/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3728/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3729/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3730/// elimination rules from earlier to implement the whole thing as a
3731/// CCMP/FCCMP chain.
3732///
3733/// As complete example:
3734/// or (or (setCA (cmp A)) (setCB (cmp B)))
3735/// (and (setCC (cmp C)) (setCD (cmp D)))"
3736/// can be reassociated to:
3737/// or (and (setCC (cmp C)) setCD (cmp D))
3738/// (or (setCA (cmp A)) (setCB (cmp B)))
3739/// can be transformed to:
3740/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3741/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3742/// which can be implemented as:
3743/// cmp C
3744/// ccmp D, inv(CD), CC
3745/// ccmp A, CA, inv(CD)
3746/// ccmp B, CB, inv(CA)
3747/// check for CB flags
3748///
3749/// A counterexample is "or (and A B) (and C D)" which translates to
3750/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); here we
3751/// can only implement one of the inner (not) operations, not both!
3752/// @{
3753
3754/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3756 ISD::CondCode CC, SDValue CCOp,
3758 AArch64CC::CondCode OutCC,
3759 const SDLoc &DL, SelectionDAG &DAG) {
3760 unsigned Opcode = 0;
3761 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3762
3763 if (LHS.getValueType().isFloatingPoint()) {
3764 assert(LHS.getValueType() != MVT::f128);
3765 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3766 LHS.getValueType() == MVT::bf16) {
3767 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3768 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3769 }
3770 Opcode = AArch64ISD::FCCMP;
3771 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3772 APInt Imm = Const->getAPIntValue();
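 // The conditional-compare immediate is 5 bits (0..31), so a negative RHS in
 // the range [-31, -1] can be handled by negating it and using CCMN instead.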
3773 if (Imm.isNegative() && Imm.sgt(-32)) {
3774 Opcode = AArch64ISD::CCMN;
3775 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3776 }
3777 } else if (isCMN(RHS, CC, DAG)) {
3778 Opcode = AArch64ISD::CCMN;
3779 RHS = RHS.getOperand(1);
3780 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3781 isIntEqualitySetCC(CC)) {
3782 // As we are looking for EQ/NE compares, the operands can be commuted; can
3783 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3784 Opcode = AArch64ISD::CCMN;
3785 LHS = LHS.getOperand(1);
3786 }
3787 if (Opcode == 0)
3788 Opcode = AArch64ISD::CCMP;
3789
3790 SDValue Condition = getCondCode(DAG, Predicate);
3792 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3793 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3794 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3795}
3796
3797/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3798/// expressed as a conjunction. See \ref AArch64CCMP.
3799/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3800/// changing the conditions on the SETCC tests.
3801/// (this means we can call emitConjunctionRec() with
3802/// Negate==true on this sub-tree)
3803/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3804/// cannot do the negation naturally. We are required to
3805/// emit the subtree first in this case.
3806/// \param WillNegate Is true if we are called when the result of this
3807/// subexpression must be negated. This happens when the
3808/// outer expression is an OR. We can use this fact to know
3809/// that we have a double negation (or (or ...) ...) that
3810/// can be implemented for free.
3811static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3812 bool &MustBeFirst, bool WillNegate,
3813 unsigned Depth = 0) {
3814 if (!Val.hasOneUse())
3815 return false;
3816 unsigned Opcode = Val->getOpcode();
3817 if (Opcode == ISD::SETCC) {
3818 if (Val->getOperand(0).getValueType() == MVT::f128)
3819 return false;
3820 CanNegate = true;
3821 MustBeFirst = false;
3822 return true;
3823 }
3824 // Protect against exponential runtime and stack overflow.
3825 if (Depth > 6)
3826 return false;
3827 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3828 bool IsOR = Opcode == ISD::OR;
3829 SDValue O0 = Val->getOperand(0);
3830 SDValue O1 = Val->getOperand(1);
3831 bool CanNegateL;
3832 bool MustBeFirstL;
3833 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3834 return false;
3835 bool CanNegateR;
3836 bool MustBeFirstR;
3837 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3838 return false;
3839
3840 if (MustBeFirstL && MustBeFirstR)
3841 return false;
3842
3843 if (IsOR) {
3844 // For an OR expression we need to be able to naturally negate at least
3845 // one side or we cannot do the transformation at all.
3846 if (!CanNegateL && !CanNegateR)
3847 return false;
3848 // If the result of the OR will be negated and we can naturally negate
3849 // the leaves, then this sub-tree as a whole negates naturally.
3850 CanNegate = WillNegate && CanNegateL && CanNegateR;
3851 // If we cannot naturally negate the whole sub-tree, then this must be
3852 // emitted first.
3853 MustBeFirst = !CanNegate;
3854 } else {
3855 assert(Opcode == ISD::AND && "Must be OR or AND");
3856 // We cannot naturally negate an AND operation.
3857 CanNegate = false;
3858 MustBeFirst = MustBeFirstL || MustBeFirstR;
3859 }
3860 return true;
3861 }
3862 return false;
3863}
3864
3865/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3866/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3867/// Tries to transform the given i1 producing node @p Val to a series of compare
3868/// and conditional compare operations. @returns an NZCV flags producing node
3869/// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
3870/// the transformation was not possible.
3871/// \p Negate is true if we want this sub-tree being negated just by changing
3872/// SETCC conditions.
3874 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3876 // We're at a tree leaf, produce a conditional comparison operation.
3877 unsigned Opcode = Val->getOpcode();
3878 if (Opcode == ISD::SETCC) {
3879 SDValue LHS = Val->getOperand(0);
3880 SDValue RHS = Val->getOperand(1);
3881 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3882 bool isInteger = LHS.getValueType().isInteger();
3883 if (Negate)
3884 CC = getSetCCInverse(CC, LHS.getValueType());
3885 SDLoc DL(Val);
3886 // Determine OutCC and handle FP special case.
3887 if (isInteger) {
3888 OutCC = changeIntCCToAArch64CC(CC, RHS);
3889 } else {
3890 assert(LHS.getValueType().isFloatingPoint());
3891 AArch64CC::CondCode ExtraCC;
3892 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3893 // Some floating point conditions can't be tested with a single condition
3894 // code. Construct an additional comparison in this case.
3895 if (ExtraCC != AArch64CC::AL) {
3896 SDValue ExtraCmp;
3897 if (!CCOp.getNode())
3898 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3899 else
3900 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3901 ExtraCC, DL, DAG);
3902 CCOp = ExtraCmp;
3903 Predicate = ExtraCC;
3904 }
3905 }
3906
3907 // Produce a normal comparison if we are first in the chain
3908 if (!CCOp)
3909 return emitComparison(LHS, RHS, CC, DL, DAG);
3910 // Otherwise produce a ccmp.
3911 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3912 DAG);
3913 }
3914 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3915
3916 bool IsOR = Opcode == ISD::OR;
3917
3918 SDValue LHS = Val->getOperand(0);
3919 bool CanNegateL;
3920 bool MustBeFirstL;
3921 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3922 assert(ValidL && "Valid conjunction/disjunction tree");
3923 (void)ValidL;
3924
3925 SDValue RHS = Val->getOperand(1);
3926 bool CanNegateR;
3927 bool MustBeFirstR;
3928 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3929 assert(ValidR && "Valid conjunction/disjunction tree");
3930 (void)ValidR;
3931
3932 // Swap sub-tree that must come first to the right side.
3933 if (MustBeFirstL) {
3934 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3935 std::swap(LHS, RHS);
3936 std::swap(CanNegateL, CanNegateR);
3937 std::swap(MustBeFirstL, MustBeFirstR);
3938 }
3939
3940 bool NegateR;
3941 bool NegateAfterR;
3942 bool NegateL;
3943 bool NegateAfterAll;
3944 if (Opcode == ISD::OR) {
3945 // Swap the sub-tree that we can negate naturally to the left.
3946 if (!CanNegateL) {
3947 assert(CanNegateR && "at least one side must be negatable");
3948 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3949 assert(!Negate);
3950 std::swap(LHS, RHS);
3951 NegateR = false;
3952 NegateAfterR = true;
3953 } else {
3954 // Negate the left sub-tree if possible, otherwise negate the result.
3955 NegateR = CanNegateR;
3956 NegateAfterR = !CanNegateR;
3957 }
3958 NegateL = true;
3959 NegateAfterAll = !Negate;
3960 } else {
3961 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3962 assert(!Negate && "Valid conjunction/disjunction tree");
3963
3964 NegateL = false;
3965 NegateR = false;
3966 NegateAfterR = false;
3967 NegateAfterAll = false;
3968 }
3969
3970 // Emit sub-trees.
3971 AArch64CC::CondCode RHSCC;
3972 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3973 if (NegateAfterR)
3974 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3975 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3976 if (NegateAfterAll)
3977 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3978 return CmpL;
3979}
3980
3981/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3982/// In some cases this is even possible with OR operations in the expression.
3983/// See \ref AArch64CCMP.
3984/// \see emitConjunctionRec().
3986 AArch64CC::CondCode &OutCC) {
3987 bool DummyCanNegate;
3988 bool DummyMustBeFirst;
3989 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3990 return SDValue();
3991
3992 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3993}
3994
3995/// @}
3996
3997/// Returns how profitable it is to fold a comparison's operand's shift and/or
3998/// extension operations.
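/// The returned weight is 0 (nothing foldable), 1 (a foldable extend or
/// shift), or 2 (a supported extend combined with a shift of at most 4).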
3999static unsigned getCmpOperandFoldingProfit(SDValue Op) {
4000 auto isSupportedExtend = [&](SDValue V) {
4001 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4002 return true;
4003
4004 if (V.getOpcode() == ISD::AND)
4005 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4006 uint64_t Mask = MaskCst->getZExtValue();
4007 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4008 }
4009
4010 return false;
4011 };
4012
4013 if (!Op.hasOneUse())
4014 return 0;
4015
4016 if (isSupportedExtend(Op))
4017 return 1;
4018
4019 unsigned Opc = Op.getOpcode();
4020 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4021 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4022 uint64_t Shift = ShiftCst->getZExtValue();
4023 if (isSupportedExtend(Op.getOperand(0)))
4024 return (Shift <= 4) ? 2 : 1;
4025 EVT VT = Op.getValueType();
4026 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4027 return 1;
4028 }
4029
4030 return 0;
4031}
4032
4033// emitComparison() converts comparison with one or negative one to comparison
4034// with 0. Note that this only works for signed comparisons because of how ANDS
4035// works.
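// For example, "(and x, y) slt 1" becomes "(and x, y) sle 0" and
// "(and x, y) sgt -1" becomes "(and x, y) sge 0", so emitComparison() can fold
// the compare into an ANDS of x and y.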
4037 // Only works for ANDS and AND.
4038 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4039 return false;
4040
4041 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4042 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4043 return true;
4044 }
4045
4046 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4047 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4048 return true;
4049 }
4050
4051 return false;
4052}
4053
4055 SDValue &AArch64cc, SelectionDAG &DAG,
4056 const SDLoc &DL) {
4057 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4058 EVT VT = RHS.getValueType();
4059 APInt C = RHSC->getAPIntValue();
4060 // shouldBeAdjustedToZero is a special case to better fold with
4061 // emitComparison().
4062 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4063 // Adjust the constant to zero.
4064 // CC has already been adjusted.
4065 RHS = DAG.getConstant(0, DL, VT);
4066 } else if (!isLegalCmpImmed(C)) {
4067 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4068 // Constant does not fit, try adjusting it by one?
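 // Illustrative: "x slt 0x1001" is not directly encodable, but it is
 // equivalent to "x sle 0x1000", which is.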
4069 switch (CC) {
4070 default:
4071 break;
4072 case ISD::SETLT:
4073 case ISD::SETGE:
4074 if (!C.isMinSignedValue()) {
4075 APInt CMinusOne = C - 1;
4076 if (isLegalCmpImmed(CMinusOne) ||
4077 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4078 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4079 RHS = DAG.getConstant(CMinusOne, DL, VT);
4080 }
4081 }
4082 break;
4083 case ISD::SETULT:
4084 case ISD::SETUGE: {
4085 // C cannot be zero: 0 is a legal immediate, and we only get here when C is not legal.
4086 assert(!C.isZero() && "C should not be zero here");
4087 APInt CMinusOne = C - 1;
4088 if (isLegalCmpImmed(CMinusOne) ||
4089 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4090 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4091 RHS = DAG.getConstant(CMinusOne, DL, VT);
4092 }
4093 break;
4094 }
4095 case ISD::SETLE:
4096 case ISD::SETGT:
4097 if (!C.isMaxSignedValue()) {
4098 APInt CPlusOne = C + 1;
4099 if (isLegalCmpImmed(CPlusOne) ||
4100 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4101 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4102 RHS = DAG.getConstant(CPlusOne, DL, VT);
4103 }
4104 }
4105 break;
4106 case ISD::SETULE:
4107 case ISD::SETUGT: {
4108 if (!C.isAllOnes()) {
4109 APInt CPlusOne = C + 1;
4110 if (isLegalCmpImmed(CPlusOne) ||
4111 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4112 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4113 RHS = DAG.getConstant(CPlusOne, DL, VT);
4114 }
4115 }
4116 break;
4117 }
4118 }
4119 }
4120 }
4121
4122 // Comparisons are canonicalized so that the RHS operand is simpler than the
4123 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4124 // can fold some shift+extend operations on the RHS operand, so swap the
4125 // operands if that can be done.
4126 //
4127 // For example:
4128 // lsl w13, w11, #1
4129 // cmp w13, w12
4130 // can be turned into:
4131 // cmp w12, w11, lsl #1
4132 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4133 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4134 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4135 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4136 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4137
4138 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4139 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4140 std::swap(LHS, RHS);
4142 }
4143 }
4144
4145 SDValue Cmp;
4146 AArch64CC::CondCode AArch64CC;
4147 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4148 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4149
4150 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4151 // For the i8 operand, the largest immediate is 255, so this can be easily
4152 // encoded in the compare instruction. For the i16 operand, however, the
4153 // largest immediate cannot be encoded in the compare.
4154 // Therefore, use a sign extending load and cmn to avoid materializing the
4155 // -1 constant. For example,
4156 // movz w1, #65535
4157 // ldrh w0, [x0, #0]
4158 // cmp w0, w1
4159 // >
4160 // ldrsh w0, [x0, #0]
4161 // cmn w0, #1
4162 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4163 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4164 // ensure both the LHS and RHS are truly zero extended and to make sure the
4165 // transformation is profitable.
4166 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4167 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4168 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4169 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4170 int16_t ValueofRHS = RHS->getAsZExtVal();
4171 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4172 SDValue SExt =
4173 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4174 DAG.getValueType(MVT::i16));
4175 Cmp = emitComparison(
4176 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4177 DL, DAG);
4178 AArch64CC = changeIntCCToAArch64CC(CC);
4179 }
4180 }
4181
4182 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4183 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4184 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4185 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4186 }
4187 }
4188 }
4189
4190 if (!Cmp) {
4191 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4192 AArch64CC = changeIntCCToAArch64CC(CC, RHS);
4193 }
4194 AArch64cc = getCondCode(DAG, AArch64CC);
4195 return Cmp;
4196}
4197
4198static std::pair<SDValue, SDValue>
4200 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4201 "Unsupported value type");
4202 SDValue Value, Overflow;
4203 SDLoc DL(Op);
4204 SDValue LHS = Op.getOperand(0);
4205 SDValue RHS = Op.getOperand(1);
4206 unsigned Opc = 0;
4207 switch (Op.getOpcode()) {
4208 default:
4209 llvm_unreachable("Unknown overflow instruction!");
4210 case ISD::SADDO:
4211 Opc = AArch64ISD::ADDS;
4212 CC = AArch64CC::VS;
4213 break;
4214 case ISD::UADDO:
4215 Opc = AArch64ISD::ADDS;
4216 CC = AArch64CC::HS;
4217 break;
4218 case ISD::SSUBO:
4219 Opc = AArch64ISD::SUBS;
4220 CC = AArch64CC::VS;
4221 break;
4222 case ISD::USUBO:
4223 Opc = AArch64ISD::SUBS;
4224 CC = AArch64CC::LO;
4225 break;
4226 // Multiply needs a little bit extra work.
4227 case ISD::SMULO:
4228 case ISD::UMULO: {
4229 CC = AArch64CC::NE;
4230 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4231 if (Op.getValueType() == MVT::i32) {
4232 // Extend to 64-bits, then perform a 64-bit multiply.
4233 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4234 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4235 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4236 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4237 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4238
4239 // Check that the result fits into a 32-bit integer.
4240 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4241 if (IsSigned) {
4242 // cmp xreg, wreg, sxtw
4243 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4244 Overflow =
4245 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4246 } else {
4247 // tst xreg, #0xffffffff00000000
4248 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4249 Overflow =
4250 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4251 }
4252 break;
4253 }
4254 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4255 // For the 64-bit multiply, detect overflow using the high half of the product.
4256 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4257 if (IsSigned) {
4258 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4259 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4260 DAG.getConstant(63, DL, MVT::i64));
4261 // It is important that LowerBits is last, otherwise the arithmetic
4262 // shift will not be folded into the compare (SUBS).
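      // For example, -3 * 5 = -15: MULHS yields -1 and -15 >> 63 is also -1,
      // so SUBS compares equal values and no overflow is reported. For
      // INT64_MIN * -1, the high half is 0 while the shifted low half is -1,
      // so the NE condition fires.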
4263 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4264 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4265 .getValue(1);
4266 } else {
4267 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4268 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4269 Overflow =
4270 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4271 DAG.getConstant(0, DL, MVT::i64),
4272 UpperBits).getValue(1);
4273 }
4274 break;
4275 }
4276 } // switch (...)
4277
4278 if (Opc) {
4279 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4280
4281 // Emit the AArch64 operation with overflow check.
4282 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4283 Overflow = Value.getValue(1);
4284 }
4285 return std::make_pair(Value, Overflow);
4286}
4287
4288SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4289 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4290 !Subtarget->isNeonAvailable()))
4291 return LowerToScalableOp(Op, DAG);
4292
4293 SDValue Sel = Op.getOperand(0);
4294 SDValue Other = Op.getOperand(1);
4295 SDLoc DL(Sel);
4296
4297 // If the operand is an overflow checking operation, invert the condition
4298 // code and kill the Not operation. I.e., transform:
4299 // (xor (overflow_op_bool, 1))
4300 // -->
4301 // (csel 1, 0, invert(cc), overflow_op_bool)
4302 // ... which later gets transformed to just a cset instruction with an
4303 // inverted condition code, rather than a cset + eor sequence.
4304 if (isOverflowIntrOpRes(Sel)) {
4305 // Only lower legal XALUO ops.
4306 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4307 return SDValue();
4308
4309 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4310 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4311 AArch64CC::CondCode CC;
4312 SDValue Value, Overflow;
4313 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4314 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4315 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4316 CCVal, Overflow);
4317 }
4318 // If neither operand is a SELECT_CC, give up.
4319 if (Sel.getOpcode() != ISD::SELECT_CC)
4320 std::swap(Sel, Other);
4321 if (Sel.getOpcode() != ISD::SELECT_CC)
4322 return Op;
4323
4324 // The folding we want to perform is:
4325 // (xor x, (select_cc a, b, cc, 0, -1) )
4326 // -->
4327 // (csel x, (xor x, -1), cc ...)
4328 //
4329 // The latter will get matched to a CSINV instruction.
4330
4331 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4332 SDValue LHS = Sel.getOperand(0);
4333 SDValue RHS = Sel.getOperand(1);
4334 SDValue TVal = Sel.getOperand(2);
4335 SDValue FVal = Sel.getOperand(3);
4336
4337 // FIXME: This could be generalized to non-integer comparisons.
4338 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4339 return Op;
4340
4341 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4342 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4343
4344 // The values aren't constants, this isn't the pattern we're looking for.
4345 if (!CFVal || !CTVal)
4346 return Op;
4347
4348 // We can commute the SELECT_CC by inverting the condition. This
4349 // might be needed to make this fit into a CSINV pattern.
4350 if (CTVal->isAllOnes() && CFVal->isZero()) {
4351 std::swap(TVal, FVal);
4352 std::swap(CTVal, CFVal);
4353 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4354 }
4355
4356 // If the constants line up, perform the transform!
4357 if (CTVal->isZero() && CFVal->isAllOnes()) {
4358 SDValue CCVal;
4359 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4360
4361 FVal = Other;
4362 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4363 DAG.getAllOnesConstant(DL, Other.getValueType()));
4364
4365 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4366 CCVal, Cmp);
4367 }
4368
4369 return Op;
4370}
4371
4372// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4373// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4374 // sets 'C' bit to 0.
4375static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4376 SDLoc DL(Value);
4377 EVT VT = Value.getValueType();
4378 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4379 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4380 SDValue Cmp =
4381 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4382 return Cmp.getValue(1);
4383}
4384
4385// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4386// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4387static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4388                                 bool Invert) {
4389 assert(Glue.getResNo() == 1);
4390 SDLoc DL(Glue);
4391 SDValue Zero = DAG.getConstant(0, DL, VT);
4392 SDValue One = DAG.getConstant(1, DL, VT);
4393 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4394 SDValue CC = getCondCode(DAG, Cond);
4395 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4396}
4397
4398 // Value is 1 if 'V' bit of NZCV is 1, else 0
4399static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4400 assert(Glue.getResNo() == 1);
4401 SDLoc DL(Glue);
4402 SDValue Zero = DAG.getConstant(0, DL, VT);
4403 SDValue One = DAG.getConstant(1, DL, VT);
4404 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4405 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4406}
4407
4408// This lowering is inefficient, but it will get cleaned up by
4409 // `foldOverflowCheck`
4410static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4411 unsigned Opcode, bool IsSigned) {
4412 EVT VT0 = Op.getValue(0).getValueType();
4413 EVT VT1 = Op.getValue(1).getValueType();
4414
4415 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4416 return SDValue();
4417
4418 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4419 SDValue OpLHS = Op.getOperand(0);
4420 SDValue OpRHS = Op.getOperand(1);
4421 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4422
4423 SDLoc DL(Op);
4424
4425 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4426 OpRHS, OpCarryIn);
4427
4428 SDValue OutFlag =
4429 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4430 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4431
4432 return DAG.getMergeValues({Sum, OutFlag}, DL);
4433}
4434
4435static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4436 // Let legalize expand this if it isn't a legal type yet.
4437 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4438 return SDValue();
4439
4440 SDLoc DL(Op);
4441 AArch64CC::CondCode CC;
4442 // The actual operation that sets the overflow or carry flag.
4443 SDValue Value, Overflow;
4444 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4445
4446 // We use 0 and 1 as false and true values.
4447 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4448 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4449
4450 // We use an inverted condition, because the conditional select is inverted
4451 // too. This will allow it to be selected to a single instruction:
4452 // CSINC Wd, WZR, WZR, invert(cond).
4453 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4454 Overflow =
4455 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4456
4457 return DAG.getMergeValues({Value, Overflow}, DL);
4458}
4459
4460// Prefetch operands are:
4461// 1: Address to prefetch
4462// 2: bool isWrite
4463// 3: int locality (0 = no locality ... 3 = extreme locality)
4464 // 4: bool isDataCache
4465static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4466 SDLoc DL(Op);
4467 unsigned IsWrite = Op.getConstantOperandVal(2);
4468 unsigned Locality = Op.getConstantOperandVal(3);
4469 unsigned IsData = Op.getConstantOperandVal(4);
4470
4471 bool IsStream = !Locality;
4472 // When the locality number is set
4473 if (Locality) {
4474 // The front-end should have filtered out the out-of-range values
4475 assert(Locality <= 3 && "Prefetch locality out-of-range");
4476 // The locality argument and the PRFM target cache level run in opposite
4477 // directions, so invert the value; the instruction encoding starts at 0
4478 // for L1.
4479 Locality = 3 - Locality;
4480 }
4481
4482 // Build the mask value encoding the expected behavior.
4483 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4484 (!IsData << 3) | // IsDataCache bit
4485 (Locality << 1) | // Cache level bits
4486 (unsigned)IsStream; // Stream bit
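  // For example, the llvm.prefetch produced by __builtin_prefetch(p, /*rw=*/0,
  // /*locality=*/3) is a data read with maximal locality: IsWrite = 0,
  // IsData = 1 and the remapped Locality is 0, so PrfOp is 0b00000, which
  // corresponds to PLDL1KEEP.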
4487 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4488 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4489 Op.getOperand(1));
4490}
4491
4492 // Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4493 // is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4494 // SUBS (AND X Y) Z, which produces better code via emitComparison.
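// For example, with Y = 0xff and Z = 16 this rewrites (x & 0xff) u< 16 into
// (x & 0xf0) == 0, which can be selected as a single ANDS with an immediate.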
4496 SelectionDAG &DAG, const SDLoc DL) {
4497 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4498 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4499 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4500 if (LHSConstOp && RHSConst) {
4501 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4502 uint64_t RHSConstant = RHSConst->getZExtValue();
4503 if (isPowerOf2_64(RHSConstant)) {
4504 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4505 LHS =
4506 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4507 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4508 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4509 CC = ISD::SETEQ;
4510 }
4511 }
4512 }
4513}
4514
4515SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4516 SelectionDAG &DAG) const {
4517 EVT VT = Op.getValueType();
4518 if (VT.isScalableVector()) {
4519 SDValue SrcVal = Op.getOperand(0);
4520
4521 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4522 // Break conversion in two with the first part converting to f32 and the
4523 // second using native f32->VT instructions.
4524 SDLoc DL(Op);
4525 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4526 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4527 }
4528
4529 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4530 }
4531
4532 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4533 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4534
4535 bool IsStrict = Op->isStrictFPOpcode();
4536 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4537 EVT Op0VT = Op0.getValueType();
4538 if (VT == MVT::f64) {
4539 // f16->f64 and f32->f64 extends are legal.
4540 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4541 return Op;
4542 // Split bf16->f64 extends into two fpextends.
4543 if (Op0VT == MVT::bf16 && IsStrict) {
4544 SDValue Ext1 =
4545 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4546 {Op0, Op.getOperand(0)});
4547 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4548 {Ext1, Ext1.getValue(1)});
4549 }
4550 if (Op0VT == MVT::bf16)
4551 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4552 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4553 return SDValue();
4554 }
4555
4556 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4557 return SDValue();
4558}
4559
4560SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4561 SelectionDAG &DAG) const {
4562 EVT VT = Op.getValueType();
4563 bool IsStrict = Op->isStrictFPOpcode();
4564 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4565 EVT SrcVT = SrcVal.getValueType();
4566 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4567
4568 if (VT.isScalableVector()) {
4569 // Let common code split the operation.
4570 if (SrcVT == MVT::nxv8f32)
4571 return Op;
4572
4573 if (VT.getScalarType() != MVT::bf16)
4574 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4575
4576 SDLoc DL(Op);
4577 constexpr EVT I32 = MVT::nxv4i32;
4578 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4579
4580 SDValue NaN;
4581 SDValue Narrow;
4582
4583 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4584 if (Subtarget->hasBF16())
4585 return LowerToPredicatedOp(Op, DAG,
4586 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4587
4588 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4589
4590 // Set the quiet bit.
4591 if (!DAG.isKnownNeverSNaN(SrcVal))
4592 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4593 } else if (SrcVT == MVT::nxv2f64 &&
4594 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4595 // Round to float without introducing rounding errors and try again.
4596 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4597 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4598 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4599
4601 if (IsStrict)
4602 NewOps.push_back(Op.getOperand(0));
4603 NewOps.push_back(Narrow);
4604 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4605 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4606 } else
4607 return SDValue();
4608
4609 if (!Trunc) {
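      // Round to nearest even by adding 0x7fff plus the LSB of the kept bits
      // to the f32 bit pattern before the final shift: a low half below
      // 0x8000 never carries into bit 16, one above 0x8000 always carries,
      // and an exact 0x8000 tie carries only when bit 16 is already 1, i.e.
      // ties are rounded to even.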
4610 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4611 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4612 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4613 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4614 }
4615
4616 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4617 // 0x80000000.
4618 if (NaN) {
4619 EVT I1 = I32.changeElementType(MVT::i1);
4620 EVT CondVT = VT.changeElementType(MVT::i1);
4621 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4622 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4623 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4624 }
4625
4626 // Now that we have rounded, shift the bits into position.
4627 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4628 return getSVESafeBitCast(VT, Narrow, DAG);
4629 }
4630
4631 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4632 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4633
4634 // Expand cases where the result type is BF16 but we don't have hardware
4635 // instructions to lower it.
4636 if (VT.getScalarType() == MVT::bf16 &&
4637 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4638 Subtarget->hasBF16())) {
4639 SDLoc DL(Op);
4640 SDValue Narrow = SrcVal;
4641 SDValue NaN;
4642 EVT I32 = SrcVT.changeElementType(MVT::i32);
4643 EVT F32 = SrcVT.changeElementType(MVT::f32);
4644 if (SrcVT.getScalarType() == MVT::f32) {
4645 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4646 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4647 if (!NeverSNaN) {
4648 // Set the quiet bit.
4649 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4650 DAG.getConstant(0x400000, DL, I32));
4651 }
4652 } else if (SrcVT.getScalarType() == MVT::f64) {
4653 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4654 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4655 } else {
4656 return SDValue();
4657 }
4658 if (!Trunc) {
4659 SDValue One = DAG.getConstant(1, DL, I32);
4660 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4661 DAG.getShiftAmountConstant(16, I32, DL));
4662 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4663 SDValue RoundingBias =
4664 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4665 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4666 }
4667
4668 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4669 // 0x80000000.
4670 if (NaN) {
4671 SDValue IsNaN = DAG.getSetCC(
4672 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4673 SrcVal, SrcVal, ISD::SETUO);
4674 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4675 }
4676
4677 // Now that we have rounded, shift the bits into position.
4678 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4679 DAG.getShiftAmountConstant(16, I32, DL));
4680 if (VT.isVector()) {
4681 EVT I16 = I32.changeVectorElementType(MVT::i16);
4682 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4683 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4684 }
4685 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4686 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4687 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4688 : Result;
4689 }
4690
4691 if (SrcVT != MVT::f128) {
4692 // Expand cases where the input is a vector bigger than NEON.
4694 return SDValue();
4695
4696 // It's legal except when f128 is involved
4697 return Op;
4698 }
4699
4700 return SDValue();
4701}
4702
4703SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4704 SelectionDAG &DAG) const {
4705 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4706 // Any additional optimization in this function should be recorded
4707 // in the cost tables.
4708 bool IsStrict = Op->isStrictFPOpcode();
4709 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4710 EVT VT = Op.getValueType();
4711
4712 assert(!(IsStrict && VT.isScalableVector()) &&
4713 "Unimplemented SVE support for STRICT_FP_to_INT!");
4714
4715 // f16 conversions are promoted to f32 when full fp16 is not supported.
4716 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4717 InVT.getVectorElementType() == MVT::bf16) {
4718 EVT NewVT = VT.changeElementType(MVT::f32);
4719 SDLoc DL(Op);
4720 if (IsStrict) {
4721 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4722 {Op.getOperand(0), Op.getOperand(1)});
4723 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4724 {Ext.getValue(1), Ext.getValue(0)});
4725 }
4726 return DAG.getNode(
4727 Op.getOpcode(), DL, Op.getValueType(),
4728 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4729 }
4730
4731 if (VT.isScalableVector()) {
4732 if (VT.getVectorElementType() == MVT::i1) {
4733 SDLoc DL(Op);
4734 EVT CvtVT = getPromotedVTForPredicate(VT);
4735 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4736 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4737 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4738 }
4739
4740 // Let common code split the operation.
4741 if (InVT == MVT::nxv8f32)
4742 return Op;
4743
4744 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4745 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4746 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4747 return LowerToPredicatedOp(Op, DAG, Opcode);
4748 }
4749
4750 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4751 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4752 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4753
4754 uint64_t VTSize = VT.getFixedSizeInBits();
4755 uint64_t InVTSize = InVT.getFixedSizeInBits();
4756 if (VTSize < InVTSize) {
4757 SDLoc DL(Op);
4758 if (IsStrict) {
4760 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4761 {Op.getOperand(0), Op.getOperand(1)});
4762 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4763 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4764 }
4765 SDValue Cv =
4766 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4767 Op.getOperand(0));
4768 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4769 }
4770
4771 if (VTSize > InVTSize) {
4772 SDLoc DL(Op);
4773 MVT ExtVT =
4776 if (IsStrict) {
4777 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4778 {Op.getOperand(0), Op.getOperand(1)});
4779 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4780 {Ext.getValue(1), Ext.getValue(0)});
4781 }
4782 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4783 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4784 }
4785
4786 // Use a scalar operation for conversions between single-element vectors of
4787 // the same size.
4788 if (InVT.getVectorNumElements() == 1) {
4789 SDLoc DL(Op);
4790 SDValue Extract = DAG.getNode(
4792 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4793 EVT ScalarVT = VT.getScalarType();
4794 if (IsStrict)
4795 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4796 {Op.getOperand(0), Extract});
4797 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4798 }
4799
4800 // Type changing conversions are illegal.
4801 return Op;
4802}
4803
4804SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4805 SelectionDAG &DAG) const {
4806 bool IsStrict = Op->isStrictFPOpcode();
4807 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4808
4809 if (SrcVal.getValueType().isVector())
4810 return LowerVectorFP_TO_INT(Op, DAG);
4811
4812 // f16 conversions are promoted to f32 when full fp16 is not supported.
4813 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4814 SrcVal.getValueType() == MVT::bf16) {
4815 SDLoc DL(Op);
4816 if (IsStrict) {
4817 SDValue Ext =
4818 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4819 {Op.getOperand(0), SrcVal});
4820 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4821 {Ext.getValue(1), Ext.getValue(0)});
4822 }
4823 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4824 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4825 }
4826
4827 if (SrcVal.getValueType() != MVT::f128) {
4828 // It's legal except when f128 is involved
4829 return Op;
4830 }
4831
4832 return SDValue();
4833}
4834
4835SDValue
4836AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4837 SelectionDAG &DAG) const {
4838 // AArch64 FP-to-int conversions saturate to the destination element size, so
4839 // we can lower common saturating conversions to simple instructions.
4840 SDValue SrcVal = Op.getOperand(0);
4841 EVT SrcVT = SrcVal.getValueType();
4842 EVT DstVT = Op.getValueType();
4843 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4844
4845 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4846 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4847 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4848 assert(SatWidth <= DstElementWidth &&
4849 "Saturation width cannot exceed result width");
4850
4851 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4852 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4853 // types, so this is hard to reach.
4854 if (DstVT.isScalableVector())
4855 return SDValue();
4856
4857 EVT SrcElementVT = SrcVT.getVectorElementType();
4858
4859 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4860 SDLoc DL(Op);
4861 SDValue SrcVal2;
4862 if ((SrcElementVT == MVT::f16 &&
4863 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4864 SrcElementVT == MVT::bf16) {
4865 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4866 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4867 // If we are extending to a v8f32, split into two v4f32 to produce legal
4868 // types.
4869 if (F32VT.getSizeInBits() > 128) {
4870 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4871 F32VT = F32VT.getHalfNumVectorElementsVT();
4872 }
4873 SrcVT = F32VT;
4874 SrcElementVT = MVT::f32;
4875 SrcElementWidth = 32;
4876 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4877 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4878 return SDValue();
4879
4880 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4881 // width and produce a fcvtzu.
4882 if (SatWidth == 64 && SrcElementWidth < 64) {
4883 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4884 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4885 SrcVT = F64VT;
4886 SrcElementVT = MVT::f64;
4887 SrcElementWidth = 64;
4888 }
4889 // Cases that we can emit directly.
4890 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4891 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4892 DAG.getValueType(DstVT.getScalarType()));
4893 if (SrcVal2) {
4894 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4895 DAG.getValueType(DstVT.getScalarType()));
4896 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4897 }
4898 return Res;
4899 }
4900
4901 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4902 // result. This is only valid if the legal cvt is larger than the saturate
4903 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4904 // (at least until sqxtn is selected).
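  // For example, a saturating f32 -> i16 conversion is emitted as a
  // full-width fcvtzs/fcvtzu to i32 lanes, clamped with SMIN(32767) and
  // SMAX(-32768) (or UMIN(65535) for the unsigned case), and then truncated
  // to i16 lanes.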
4905 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4906 return SDValue();
4907
4908 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4909 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4910 DAG.getValueType(IntVT.getScalarType()));
4911 SDValue NativeCvt2 =
4912 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4913 DAG.getValueType(IntVT.getScalarType()))
4914 : SDValue();
4915 SDValue Sat, Sat2;
4916 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4917 SDValue MinC = DAG.getConstant(
4918 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4919 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4920 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4921 SDValue MaxC = DAG.getConstant(
4922 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4923 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4924 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4925 } else {
4926 SDValue MinC = DAG.getConstant(
4927 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4928 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4929 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4930 }
4931
4932 if (SrcVal2)
4933 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4935 Sat, Sat2);
4936
4937 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4938}
4939
4940SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4941 SelectionDAG &DAG) const {
4942 // AArch64 FP-to-int conversions saturate to the destination register size, so
4943 // we can lower common saturating conversions to simple instructions.
4944 SDValue SrcVal = Op.getOperand(0);
4945 EVT SrcVT = SrcVal.getValueType();
4946
4947 if (SrcVT.isVector())
4948 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4949
4950 EVT DstVT = Op.getValueType();
4951 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4952 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4953 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4954 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4955
4956 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4957 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4958 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4959 SrcVT = MVT::f32;
4960 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4961 SrcVT != MVT::bf16)
4962 return SDValue();
4963
4964 SDLoc DL(Op);
4965 // Cases that we can emit directly.
4966 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4967 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4968 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4969 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4970 DAG.getValueType(DstVT));
4971
4972 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4973 // result. This is only valid if the legal cvt is larger than the saturate
4974 // width.
4975 if (DstWidth < SatWidth)
4976 return SDValue();
4977
4978 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
4979 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4980 SDValue CVTf32 =
4981 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
4982 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
4983 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
4984 DAG.getValueType(SatVT));
4985 }
4986 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
4987 return DAG.getBitcast(DstVT, CVTf32);
4988 }
4989
4990 SDValue NativeCvt =
4991 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4992 SDValue Sat;
4993 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4994 SDValue MinC = DAG.getConstant(
4995 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4996 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4997 SDValue MaxC = DAG.getConstant(
4998 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4999 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5000 } else {
5001 SDValue MinC = DAG.getConstant(
5002 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5003 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5004 }
5005
5006 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5007}
5008
5009SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5010 SelectionDAG &DAG) const {
5011 EVT VT = Op.getValueType();
5012 SDValue Src = Op.getOperand(0);
5013 SDLoc DL(Op);
5014
5015 assert(VT.isVector() && "Expected vector type");
5016
5017 EVT CastVT =
5018 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5019
5020 // Round the floating-point value into a floating-point register with the
5021 // current rounding mode.
5022 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5023
5024 // Truncate the rounded floating point to an integer.
5025 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5027}
5028
5029SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5030 SelectionDAG &DAG) const {
5031 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5032 // Any additional optimization in this function should be recorded
5033 // in the cost tables.
5034 bool IsStrict = Op->isStrictFPOpcode();
5035 EVT VT = Op.getValueType();
5036 SDLoc DL(Op);
5037 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5038 EVT InVT = In.getValueType();
5039 unsigned Opc = Op.getOpcode();
5040 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5041
5042 assert(!(IsStrict && VT.isScalableVector()) &&
5043 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5044
5045 // NOTE: i1->bf16 does not require promotion to f32.
5046 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5047 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5048 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5049 : DAG.getConstantFP(1.0, DL, VT);
5050 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5051 }
5052
5053 // Promote bf16 conversions to f32.
5054 if (VT.getVectorElementType() == MVT::bf16) {
5055 EVT F32 = VT.changeElementType(MVT::f32);
5056 if (IsStrict) {
5057 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5058 {Op.getOperand(0), In});
5059 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5060 {Op.getValueType(), MVT::Other},
5061 {Val.getValue(1), Val.getValue(0),
5062 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5063 }
5064 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5065 DAG.getNode(Op.getOpcode(), DL, F32, In),
5066 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5067 }
5068
5069 if (VT.isScalableVector()) {
5070 // Let common code split the operation.
5071 if (VT == MVT::nxv8f32)
5072 return Op;
5073
5074 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5075 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5076 return LowerToPredicatedOp(Op, DAG, Opcode);
5077 }
5078
5079 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5080 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5081 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5082
5083 uint64_t VTSize = VT.getFixedSizeInBits();
5084 uint64_t InVTSize = InVT.getFixedSizeInBits();
5085 if (VTSize < InVTSize) {
5086 // AArch64 doesn't have a direct vector instruction to convert an
5087 // integer to floating point AND narrow it at the same time.
5088 // Additional rounding when the target is f32/f64 causes double
5089 // rounding issues. Conversion to f16 is fine due to its narrow width.
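    // For instance, for the i64 value (1 << 60) + (1 << 36) + 1, rounding
    // first to f64 drops the trailing 1 and lands exactly on an f32 tie,
    // which then rounds down to 1 << 60, whereas a single correctly rounded
    // conversion to f32 would give (1 << 60) + (1 << 37).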
5090 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5091 bool IsTargetf16 = false;
5092 if (Op.hasOneUse() &&
5093 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5094 // Some vector types are split during legalization into half, followed by
5095 // concatenation, followed by rounding to the original vector type. If we
5096 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5097 SDNode *U = *Op->user_begin();
5098 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5099 EVT TmpVT = U->user_begin()->getValueType(0);
5100 if (TmpVT.getScalarType() == MVT::f16)
5101 IsTargetf16 = true;
5102 }
5103 }
5104
5105 if (IsTargetf32 && !IsTargetf16) {
5106 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5107 }
5108
5109 MVT CastVT =
5111 InVT.getVectorNumElements());
5112 if (IsStrict) {
5113 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5114 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5115 {In.getValue(1), In.getValue(0),
5116 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5117 }
5118 In = DAG.getNode(Opc, DL, CastVT, In);
5119 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5120 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5121 }
5122
5123 if (VTSize > InVTSize) {
5124 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5126 In = DAG.getNode(CastOpc, DL, CastVT, In);
5127 if (IsStrict)
5128 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5129 return DAG.getNode(Opc, DL, VT, In);
5130 }
5131
5132 // Use a scalar operation for conversions between single-element vectors of
5133 // the same size.
5134 if (VT.getVectorNumElements() == 1) {
5135 SDValue Extract =
5137 DAG.getConstant(0, DL, MVT::i64));
5138 EVT ScalarVT = VT.getScalarType();
5139 if (IsStrict)
5140 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5141 {Op.getOperand(0), Extract});
5142 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5143 }
5144
5145 return Op;
5146}
5147
5148SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5149 SelectionDAG &DAG) const {
5150 if (Op.getValueType().isVector())
5151 return LowerVectorINT_TO_FP(Op, DAG);
5152
5153 bool IsStrict = Op->isStrictFPOpcode();
5154 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5155
5156 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5157 Op->getOpcode() == ISD::SINT_TO_FP;
5158
5159 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5160 SDLoc DL(Op);
5161 if (IsStrict) {
5162 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5163 {Op.getOperand(0), SrcVal});
5164 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5165 {Op.getValueType(), MVT::Other},
5166 {Val.getValue(1), Val.getValue(0),
5167 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5168 }
5169 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5170 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5171 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5172 };
5173
5174 if (Op.getValueType() == MVT::bf16) {
5175 unsigned MaxWidth = IsSigned
5176 ? DAG.ComputeMaxSignificantBits(SrcVal)
5177 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5178 // bf16 conversions are promoted to f32 when the source has at most 24
    // significant bits (e.g. any i16), since such values are exact in f32.
5179 if (MaxWidth <= 24) {
5180 return IntToFpViaPromotion(MVT::f32);
5181 }
5182
5183 // bf16 conversions are promoted to f64 when the source has at most 53
    // significant bits (e.g. any i32), since such values are exact in f64.
5184 if (MaxWidth <= 53) {
5185 return IntToFpViaPromotion(MVT::f64);
5186 }
5187
5188 // We need to be careful about i64 -> bf16.
5189 // As an illustration, consider the i32 value 22216703. It cannot be
5190 // represented exactly as an f32, so itofp turns it into 22216704.0, and
5191 // fptrunc of that to bf16 gives 22282240.0, whereas the correctly
5192 // rounded bf16 result is 22151168.0.
5193 // We need to use sticky rounding to get this correct.
5194 if (SrcVal.getValueType() == MVT::i64) {
5195 SDLoc DL(Op);
5196 // This algorithm is equivalent to the following:
5197 // uint64_t SrcHi = SrcVal & ~0xfffull;
5198 // uint64_t SrcLo = SrcVal & 0xfffull;
5199 // uint64_t Highest = SrcVal >> 53;
5200 // bool HasHighest = Highest != 0;
5201 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5202 // double Rounded = static_cast<double>(ToRound);
5203 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5204 // uint64_t HasLo = SrcLo != 0;
5205 // bool NeedsAdjustment = HasHighest & HasLo;
5206 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5207 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5208 // return static_cast<__bf16>(Adjusted);
5209 //
5210 // Essentially, what happens is that SrcVal either fits perfectly in a
5211 // double-precision value or it is too big. If it is sufficiently small,
5212 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5213 // ensure that u64 -> double has no rounding error by only using the 52
5214 // MSB of the input. The low order bits will get merged into a sticky bit
5215 // which will avoid issues incurred by double rounding.
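      // For instance, for SrcVal = (1 << 53) + (1 << 45) + 1, a plain
      // i64 -> double conversion rounds down to (1 << 53) + (1 << 45), which
      // is exactly halfway between two bf16 values and would then round to
      // 1 << 53; OR-ing in the sticky bit nudges the double just above that
      // halfway point, so the final rounding correctly gives
      // (1 << 53) + (1 << 46).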
5216
5217 // Signed conversion is more or less like so:
5218 // copysign((__bf16)abs(SrcVal), SrcVal)
5219 SDValue SignBit;
5220 if (IsSigned) {
5221 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5222 DAG.getConstant(1ull << 63, DL, MVT::i64));
5223 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5224 }
5225 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5226 DAG.getConstant(~0xfffull, DL, MVT::i64));
5227 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5228 DAG.getConstant(0xfffull, DL, MVT::i64));
5230 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5231 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5232 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5233 SDValue ToRound =
5234 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5235 SDValue Rounded =
5236 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5237 {Op.getOperand(0), ToRound})
5238 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5239
5240 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5241 if (SignBit) {
5242 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5243 }
5244
5245 SDValue HasHighest = DAG.getSetCC(
5246 DL,
5247 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5248 Highest, Zero64, ISD::SETNE);
5249
5250 SDValue HasLo = DAG.getSetCC(
5251 DL,
5252 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5253 SrcLo, Zero64, ISD::SETNE);
5254
5255 SDValue NeedsAdjustment =
5256 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5257 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5258
5259 SDValue AdjustedBits =
5260 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5261 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5262 return IsStrict
5263 ? DAG.getNode(
5265 {Op.getValueType(), MVT::Other},
5266 {Rounded.getValue(1), Adjusted,
5267 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5268 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5269 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5270 }
5271 }
5272
5273 // f16 conversions are promoted to f32 when full fp16 is not supported.
5274 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5275 return IntToFpViaPromotion(MVT::f32);
5276 }
5277
5278 // i128 conversions are libcalls.
5279 if (SrcVal.getValueType() == MVT::i128)
5280 return SDValue();
5281
5282 // Other conversions are legal, unless it's to the completely software-based
5283 // fp128.
5284 if (Op.getValueType() != MVT::f128)
5285 return Op;
5286 return SDValue();
5287}
5288
5289SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5290 SelectionDAG &DAG) const {
5291 // For iOS, we want to call an alternative entry point: __sincos_stret,
5292 // which returns the values in two S / D registers.
5293 SDLoc DL(Op);
5294 SDValue Arg = Op.getOperand(0);
5295 EVT ArgVT = Arg.getValueType();
5296 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5297
5299 Args.emplace_back(Arg, ArgTy);
5300
5301 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5302 : RTLIB::SINCOS_STRET_F32;
5303 const char *LibcallName = getLibcallName(LC);
5304 SDValue Callee =
5305 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5306
5307 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5310 CLI.setDebugLoc(DL)
5311 .setChain(DAG.getEntryNode())
5312 .setLibCallee(CC, RetTy, Callee, std::move(Args));
5313
5314 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5315 return CallResult.first;
5316}
5317
5318static MVT getSVEContainerType(EVT ContentTy);
5319
5320SDValue
5321AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5322 SelectionDAG &DAG) const {
5323 SDLoc DL(Op);
5324 uint64_t EltSize = Op.getConstantOperandVal(2);
5325 EVT VT = Op.getValueType();
5326 switch (EltSize) {
5327 case 1:
5328 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5329 return SDValue();
5330 break;
5331 case 2:
5332 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5333 return SDValue();
5334 break;
5335 case 4:
5336 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5337 return SDValue();
5338 break;
5339 case 8:
5340 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5341 return SDValue();
5342 break;
5343 default:
5344 // Other element sizes are incompatible with whilewr/rw, so expand instead
5345 return SDValue();
5346 }
5347
5348 SDValue PtrA = Op.getOperand(0);
5349 SDValue PtrB = Op.getOperand(1);
5350
5351 if (VT.isScalableVT())
5352 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
5353
5354 // We can use the SVE whilewr/whilerw instruction to lower this
5355 // intrinsic by creating the appropriate sequence of scalable vector
5356 // operations and then extracting a fixed-width subvector from the scalable
5357 // vector. Scalable vector variants are already legal.
5358 EVT ContainerVT =
5360 VT.getVectorNumElements(), true);
5361 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5362
5363 SDValue Mask =
5364 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
5365 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5366 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5367 DAG.getVectorIdxConstant(0, DL));
5368}
5369
5370SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5371 SelectionDAG &DAG) const {
5372 EVT OpVT = Op.getValueType();
5373 EVT ArgVT = Op.getOperand(0).getValueType();
5374
5376 return LowerFixedLengthBitcastToSVE(Op, DAG);
5377
5378 if (OpVT.isScalableVector()) {
5379 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5380
5381 // Handle type legalisation first.
5382 if (!isTypeLegal(ArgVT)) {
5383 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5384 "Expected int->fp bitcast!");
5385
5386 // Bitcasting between unpacked vector types of different element counts is
5387 // not a NOP because the live elements are laid out differently.
5388 // 01234567
5389 // e.g. nxv2i32 = XX??XX??
5390 // nxv4f16 = X?X?X?X?
5391 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5392 return SDValue();
5393
5394 SDValue ExtResult =
5396 Op.getOperand(0));
5397 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5398 }
5399
5400 // Bitcasts between legal types with the same element count are legal.
5401 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5402 return Op;
5403
5404 // getSVESafeBitCast does not support casting between unpacked types.
5405 if (!isPackedVectorType(OpVT, DAG))
5406 return SDValue();
5407
5408 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5409 }
5410
5411 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5412 return SDValue();
5413
5414 // Bitcasts between f16 and bf16 are legal.
5415 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5416 return Op;
5417
5418 assert(ArgVT == MVT::i16);
5419 SDLoc DL(Op);
5420
5421 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5422 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5423 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5424}
5425
5426// Returns lane if Op extracts from a two-element vector and lane is constant
5427// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5428static std::optional<uint64_t>
5430 SDNode *OpNode = Op.getNode();
5431 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5432 return std::nullopt;
5433
5434 EVT VT = OpNode->getOperand(0).getValueType();
5435 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5436 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5437 return std::nullopt;
5438
5439 return C->getZExtValue();
5440}
5441
5443 bool isSigned) {
5444 EVT VT = N.getValueType();
5445
5446 if (N.getOpcode() != ISD::BUILD_VECTOR)
5447 return false;
5448
5449 for (const SDValue &Elt : N->op_values()) {
5450 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5451 unsigned EltSize = VT.getScalarSizeInBits();
5452 unsigned HalfSize = EltSize / 2;
5453 if (isSigned) {
5454 if (!isIntN(HalfSize, C->getSExtValue()))
5455 return false;
5456 } else {
5457 if (!isUIntN(HalfSize, C->getZExtValue()))
5458 return false;
5459 }
5460 continue;
5461 }
5462 return false;
5463 }
5464
5465 return true;
5466}
5467
5469 EVT VT = N.getValueType();
5470 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5471 EVT HalfVT = EVT::getVectorVT(
5472 *DAG.getContext(),
5475 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5476}
5477
5479 return N.getOpcode() == ISD::SIGN_EXTEND ||
5480 N.getOpcode() == ISD::ANY_EXTEND ||
5481 isExtendedBUILD_VECTOR(N, DAG, true);
5482}
5483
5485 return N.getOpcode() == ISD::ZERO_EXTEND ||
5486 N.getOpcode() == ISD::ANY_EXTEND ||
5487 isExtendedBUILD_VECTOR(N, DAG, false);
5488}
5489
5491 unsigned Opcode = N.getOpcode();
5492 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5493 SDValue N0 = N.getOperand(0);
5494 SDValue N1 = N.getOperand(1);
5495 return N0->hasOneUse() && N1->hasOneUse() &&
5496 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5497 }
5498 return false;
5499}
5500
5502 unsigned Opcode = N.getOpcode();
5503 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5504 SDValue N0 = N.getOperand(0);
5505 SDValue N1 = N.getOperand(1);
5506 return N0->hasOneUse() && N1->hasOneUse() &&
5507 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5508 }
5509 return false;
5510}
5511
5512SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5513 SelectionDAG &DAG) const {
5514 // The rounding mode is in bits 23:22 of the FPCR.
5515 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5516 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3
5517 // so that the shift + and get folded into a bitfield extract.
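  // For example, an FPCR rounding field of 0b11 (round toward zero) gives
  // ((3 << 22) + (1 << 22)) >> 22 = 4 and 4 & 3 = 0, which is the FLT_ROUNDS
  // value for "toward zero".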
5518 SDLoc DL(Op);
5519
5520 SDValue Chain = Op.getOperand(0);
5521 SDValue FPCR_64 = DAG.getNode(
5522 ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5523 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)});
5524 Chain = FPCR_64.getValue(1);
5525 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5526 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5527 DAG.getConstant(1U << 22, DL, MVT::i32));
5528 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5529 DAG.getConstant(22, DL, MVT::i32));
5530 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5531 DAG.getConstant(3, DL, MVT::i32));
5532 return DAG.getMergeValues({AND, Chain}, DL);
5533}
5534
5535SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5536 SelectionDAG &DAG) const {
5537 SDLoc DL(Op);
5538 SDValue Chain = Op->getOperand(0);
5539 SDValue RMValue = Op->getOperand(1);
5540
5541 // The rounding mode is in bits 23:22 of the FPCR.
5542 // The mapping from the llvm.set.rounding argument to the rounding mode in
5543 // FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5544 // ((arg - 1) & 3) << 22.
5545 //
5546 // The argument of llvm.set.rounding must be within the range [0, 3], so
5547 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5548 // code that generates llvm.set.rounding to ensure this condition.
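  // For example, an argument of 0 (toward zero) gives ((0 - 1) & 3) = 3, i.e.
  // FPCR.RMode = 0b11 (round toward zero), the inverse of the mapping used by
  // LowerGET_ROUNDING above.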
5549
5550 // Calculate new value of FPCR[23:22].
5551 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5552 DAG.getConstant(1, DL, MVT::i32));
5553 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5554 DAG.getConstant(0x3, DL, MVT::i32));
5555 RMValue =
5556 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5557 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5558 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5559
5560 // Get current value of FPCR.
5561 SDValue Ops[] = {
5562 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5563 SDValue FPCR =
5564 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5565 Chain = FPCR.getValue(1);
5566 FPCR = FPCR.getValue(0);
5567
5568 // Put the new rounding mode into FPCR[23:22].
5569 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5570 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5571 DAG.getConstant(RMMask, DL, MVT::i64));
5572 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5573 SDValue Ops2[] = {
5574 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5575 FPCR};
5576 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5577}
5578
5579SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5580 SelectionDAG &DAG) const {
5581 SDLoc DL(Op);
5582 SDValue Chain = Op->getOperand(0);
5583
5584 // Get current value of FPCR.
5585 SDValue Ops[] = {
5586 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5587 SDValue FPCR =
5588 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5589 Chain = FPCR.getValue(1);
5590 FPCR = FPCR.getValue(0);
5591
5592 // Truncate FPCR to 32 bits.
5593 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5594
5595 return DAG.getMergeValues({Result, Chain}, DL);
5596}
5597
5598SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5599 SelectionDAG &DAG) const {
5600 SDLoc DL(Op);
5601 SDValue Chain = Op->getOperand(0);
5602 SDValue Mode = Op->getOperand(1);
5603
5604 // Extend the specified value to 64 bits.
5605 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5606
5607 // Set new value of FPCR.
5608 SDValue Ops2[] = {
5609 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5610 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5611}
5612
5613SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5614 SelectionDAG &DAG) const {
5615 SDLoc DL(Op);
5616 SDValue Chain = Op->getOperand(0);
5617
5618 // Get current value of FPCR.
5619 SDValue Ops[] = {
5620 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5621 SDValue FPCR =
5622 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5623 Chain = FPCR.getValue(1);
5624 FPCR = FPCR.getValue(0);
5625
5626 // Clear bits that are not reserved.
5627 SDValue FPSCRMasked = DAG.getNode(
5628 ISD::AND, DL, MVT::i64, FPCR,
5630
5631 // Set new value of FPCR.
5632 SDValue Ops2[] = {Chain,
5633 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5634 FPSCRMasked};
5635 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5636}
5637
5638static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5639 SDLoc DL, bool &IsMLA) {
5640 bool IsN0SExt = isSignExtended(N0, DAG);
5641 bool IsN1SExt = isSignExtended(N1, DAG);
5642 if (IsN0SExt && IsN1SExt)
5643 return AArch64ISD::SMULL;
5644
5645 bool IsN0ZExt = isZeroExtended(N0, DAG);
5646 bool IsN1ZExt = isZeroExtended(N1, DAG);
5647
5648 if (IsN0ZExt && IsN1ZExt)
5649 return AArch64ISD::UMULL;
5650
5651 // Select UMULL if we can replace the other operand with an extend.
5652 EVT VT = N0.getValueType();
5653 unsigned EltSize = VT.getScalarSizeInBits();
5654 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5655 if (IsN0ZExt || IsN1ZExt) {
5656 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5657 return AArch64ISD::UMULL;
5658 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5659 DAG.MaskedValueIsZero(N1, Mask)) {
5660 // For v2i64 we look more aggressively at both operands having their top
5661 // half known to be zero, to avoid scalarization.
5662 return AArch64ISD::UMULL;
5663 }
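  // For example, a v4i32 multiply where one operand is (zext v4i16) and the
  // other is masked with 0xffff can still use UMULL: the masked operand's top
  // half is known to be zero even though it is not an explicit extend.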
5664
5665 if (IsN0SExt || IsN1SExt) {
5666 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5667 return AArch64ISD::SMULL;
5668 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5669 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5670 return AArch64ISD::SMULL;
5671 }
5672
5673 if (!IsN1SExt && !IsN1ZExt)
5674 return 0;
5675
5676 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5677 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5678 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5679 IsMLA = true;
5680 return AArch64ISD::SMULL;
5681 }
5682 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5683 IsMLA = true;
5684 return AArch64ISD::UMULL;
5685 }
5686 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5687 std::swap(N0, N1);
5688 IsMLA = true;
5689 return AArch64ISD::UMULL;
5690 }
5691 return 0;
5692}
5693
5694SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5695 EVT VT = Op.getValueType();
5696
5697 bool OverrideNEON = !Subtarget->isNeonAvailable();
5698 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5699 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5700
5701 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5702 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5703 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5704 "unexpected type for custom-lowering ISD::MUL");
5705 SDValue N0 = Op.getOperand(0);
5706 SDValue N1 = Op.getOperand(1);
5707 bool isMLA = false;
5708 EVT OVT = VT;
5709 if (VT.is64BitVector()) {
5710 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5711 isNullConstant(N0.getOperand(1)) &&
5713 isNullConstant(N1.getOperand(1))) {
5714 N0 = N0.getOperand(0);
5715 N1 = N1.getOperand(0);
5716 VT = N0.getValueType();
5717 } else {
5718 if (VT == MVT::v1i64) {
5719 if (Subtarget->hasSVE())
5720 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5721 // Fall through to expand this. It is not legal.
5722 return SDValue();
5723 } else
5724 // Other vector multiplications are legal.
5725 return Op;
5726 }
5727 }
5728
5729 SDLoc DL(Op);
5730 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5731
5732 if (!NewOpc) {
5733 if (VT.getVectorElementType() == MVT::i64) {
5734 // If SVE is available then i64 vector multiplications can also be made
5735 // legal.
5736 if (Subtarget->hasSVE())
5737 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5738 // Fall through to expand this. It is not legal.
5739 return SDValue();
5740 } else
5741 // Other vector multiplications are legal.
5742 return Op;
5743 }
5744
5745 // Legalize to a S/UMULL instruction
5746 SDValue Op0;
5747 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5748 if (!isMLA) {
5749 Op0 = skipExtensionForVectorMULL(N0, DAG);
5751 Op1.getValueType().is64BitVector() &&
5752 "unexpected types for extended operands to VMULL");
5753 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5754 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5755 DAG.getConstant(0, DL, MVT::i64));
5756 }
5757 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5758 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5759 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5760 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5761 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5762 EVT Op1VT = Op1.getValueType();
5763 return DAG.getNode(
5764      ISD::EXTRACT_SUBVECTOR, DL, OVT,
5765 DAG.getNode(N0.getOpcode(), DL, VT,
5766 DAG.getNode(NewOpc, DL, VT,
5767 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5768 DAG.getNode(NewOpc, DL, VT,
5769 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5770 DAG.getConstant(0, DL, MVT::i64));
5771}
5772
5773static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5774 int Pattern) {
5775 if (Pattern == AArch64SVEPredPattern::all)
5776 return DAG.getConstant(1, DL, VT);
5777 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5778 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5779}
5780
5782 bool IsSigned, bool IsEqual) {
5783 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5784 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5785
5786 if (!N->getValueType(0).isScalableVector() ||
5787 !isa<ConstantSDNode>(N->getOperand(Op1)))
5788 return SDValue();
5789
5790 SDLoc DL(N);
5791 APInt Y = N->getConstantOperandAPInt(Op1);
5792
5793 // When the second operand is the maximum value, comparisons that include
5794 // equality can never fail and thus we can return an all active predicate.
5795 if (IsEqual)
5796 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5797 return DAG.getConstant(1, DL, N->getValueType(0));
5798
5799 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5800 return SDValue();
5801
5802 APInt X = N->getConstantOperandAPInt(Op0);
5803
5804 bool Overflow;
5805 APInt NumActiveElems =
5806 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5807
5808 if (Overflow)
5809 return SDValue();
5810
5811 if (IsEqual) {
5812 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5813 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5814 : NumActiveElems.uadd_ov(One, Overflow);
5815 if (Overflow)
5816 return SDValue();
5817 }
5818
5819 std::optional<unsigned> PredPattern =
5820 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5821 unsigned MinSVEVectorSize = std::max(
5822 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5823 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5824 if (PredPattern != std::nullopt &&
5825 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5826 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5827
5828 return SDValue();
5829}
5830
5831// Returns a safe bitcast between two scalable vector predicates, where
5832// any newly created lanes from a widening bitcast are defined as zero.
5833 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5834 SDLoc DL(Op);
5835 EVT InVT = Op.getValueType();
5836
5837 assert(InVT.getVectorElementType() == MVT::i1 &&
5838 VT.getVectorElementType() == MVT::i1 &&
5839 "Expected a predicate-to-predicate bitcast");
5840 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5841 InVT.isScalableVector() &&
5842 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5843 "Only expect to cast between legal scalable predicate types!");
5844
5845 // Return the operand if the cast isn't changing type.
5846 if (InVT == VT)
5847 return Op;
5848
5849 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5850 // than VT. This will increase the chances of removing casts that introduce
5851 // new lanes, which have to be explicitly zero'd.
5852 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5853 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5854 Op.getOperand(1).getValueType().bitsGT(VT))
5855 Op = Op.getOperand(1);
5856
5857 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5858
5859 // We only have to zero the lanes if new lanes are being defined, e.g. when
5860 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5861 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5862 // we can return here.
5863 if (InVT.bitsGT(VT))
5864 return Reinterpret;
5865
5866 // Check if the other lanes are already known to be zeroed by
5867 // construction.
5868 if (isZeroingInactiveLanes(Op))
5869 return Reinterpret;
5870
5871 // Zero the newly introduced lanes.
5872 SDValue Mask = DAG.getConstant(1, DL, InVT);
5873 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5874 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5875}
5876
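// Materialise the current value of PSTATE.SM by calling the SME ABI state
// routine (RTLIB::SMEABI_SME_STATE) and masking bit 0 of the first returned
// value, which holds the streaming-mode flag.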
5877SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5878 SDValue Chain, SDLoc DL,
5879 EVT VT) const {
5880 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
5881 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
5882 getPointerTy(DAG.getDataLayout()));
5883 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5884 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5885 TargetLowering::CallLoweringInfo CLI(DAG);
5886 ArgListTy Args;
5887 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5888 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5889 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5890 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5891 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5892 Mask);
5893}
5894
5895// Lower an SME LDR/STR ZA intrinsic
5896// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5897// folded into the instruction
5898// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5899// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5900// and tile slice registers
5901// ldr(%tileslice, %ptr, %vecnum)
5902// ->
5903// %svl = rdsvl
5904// %ptr2 = %ptr + %svl * %vecnum
5905// %tileslice2 = %tileslice + %vecnum
5906// ldr [%tileslice2, 0], [%ptr2, 0]
5907// Case 3: If the vecnum is an immediate out of range, then the same is done as
5908// case 2, but the base and slice registers are modified by the greatest
5909// multiple of 15 lower than the vecnum and the remainder is folded into the
5910// instruction. This means that successive loads and stores that are offset from
5911// each other can share the same base and slice register updates.
5912// ldr(%tileslice, %ptr, 22)
5913// ldr(%tileslice, %ptr, 23)
5914// ->
5915// %svl = rdsvl
5916// %ptr2 = %ptr + %svl * 15
5917// %tileslice2 = %tileslice + 15
5918// ldr [%tileslice2, 7], [%ptr2, 7]
5919// ldr [%tileslice2, 8], [%ptr2, 8]
5920// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5921// operand and the immediate can be folded into the instruction, like case 2.
5922// ldr(%tileslice, %ptr, %vecnum + 7)
5923// ldr(%tileslice, %ptr, %vecnum + 8)
5924// ->
5925// %svl = rdsvl
5926// %ptr2 = %ptr + %svl * %vecnum
5927// %tileslice2 = %tileslice + %vecnum
5928// ldr [%tileslice2, 7], [%ptr2, 7]
5929// ldr [%tileslice2, 8], [%ptr2, 8]
5930// Case 5: The vecnum being an add of an immediate out of range is also handled,
5931// in which case the same remainder logic as case 3 is used.
5932 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5933 SDLoc DL(N);
5934
5935 SDValue TileSlice = N->getOperand(2);
5936 SDValue Base = N->getOperand(3);
5937 SDValue VecNum = N->getOperand(4);
5938 int32_t ConstAddend = 0;
5939 SDValue VarAddend = VecNum;
5940
5941 // If the vnum is an add of an immediate, we can fold it into the instruction
5942 if (VecNum.getOpcode() == ISD::ADD &&
5943 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5944 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5945 VarAddend = VecNum.getOperand(0);
5946 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5947 ConstAddend = ImmNode->getSExtValue();
5948 VarAddend = SDValue();
5949 }
5950
5951 int32_t ImmAddend = ConstAddend % 16;
5952 if (int32_t C = (ConstAddend - ImmAddend)) {
5953 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5954 VarAddend = VarAddend
5955 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5956 : CVal;
5957 }
5958
5959 if (VarAddend) {
5960 // Get the vector length that will be multiplied by vnum
5961 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5962 DAG.getConstant(1, DL, MVT::i32));
5963
5964 // Multiply SVL and vnum then add it to the base
5965 SDValue Mul = DAG.getNode(
5966 ISD::MUL, DL, MVT::i64,
5967 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5968 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5969 // Just add vnum to the tileslice
5970 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5971 }
5972
5973 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5974 DL, MVT::Other,
5975 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5976 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5977}
5978
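// Lower @llvm.experimental.vector.match onto the SVE MATCH instruction (via
// the aarch64_sve_match intrinsic). Fixed-length operands and masks are first
// wrapped in scalable containers, and a fixed-length result is extracted back
// out at the end.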
5979 static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
5980 SDLoc DL(Op);
5981 SDValue ID =
5982 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
5983
5984 auto Op1 = Op.getOperand(1);
5985 auto Op2 = Op.getOperand(2);
5986 auto Mask = Op.getOperand(3);
5987
5988 EVT Op1VT = Op1.getValueType();
5989 EVT Op2VT = Op2.getValueType();
5990 EVT ResVT = Op.getValueType();
5991
5992 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5993 Op1VT.getVectorElementType() == MVT::i16) &&
5994 "Expected 8-bit or 16-bit characters.");
5995
5996 // Scalable vector type used to wrap operands.
5997 // A single container is enough for both operands because ultimately the
5998 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5999 EVT OpContainerVT = Op1VT.isScalableVector()
6000 ? Op1VT
6001 : getContainerForFixedLengthVector(DAG, Op1VT);
6002
6003 if (Op2VT.is128BitVector()) {
6004 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6005 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6006 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6007 if (ResVT.isScalableVector())
6008 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6009 DAG.getTargetConstant(0, DL, MVT::i64));
6010 } else {
6011 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6012 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6013 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6014 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6015 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6016 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6017 DAG.getConstant(0, DL, MVT::i64));
6018 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6019 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6020 }
6021
6022 // If the result is scalable, we just need to carry out the MATCH.
6023 if (ResVT.isScalableVector())
6024 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6025
6026 // If the result is fixed, we can still use MATCH but we need to wrap the
6027 // first operand and the mask in scalable vectors before doing so.
6028
6029 // Wrap the operands.
6030 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6031 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6032 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6033
6034 // Carry out the match.
6035 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6036 ID, Mask, Op1, Op2);
6037
6038 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6039 // (v16i8/v8i8).
6040 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6041 Match = convertFromScalableVector(DAG, Op1VT, Match);
6042 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6043}
6044
6045SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6046 SelectionDAG &DAG) const {
6047 unsigned IntNo = Op.getConstantOperandVal(1);
6048 SDLoc DL(Op);
6049 switch (IntNo) {
6050 default:
6051 return SDValue(); // Don't custom lower most intrinsics.
6052 case Intrinsic::aarch64_prefetch: {
6053 SDValue Chain = Op.getOperand(0);
6054 SDValue Addr = Op.getOperand(2);
6055
6056 unsigned IsWrite = Op.getConstantOperandVal(3);
6057 unsigned Locality = Op.getConstantOperandVal(4);
6058 unsigned IsStream = Op.getConstantOperandVal(5);
6059 unsigned IsData = Op.getConstantOperandVal(6);
6060 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6061 (!IsData << 3) | // IsDataCache bit
6062 (Locality << 1) | // Cache level bits
6063 (unsigned)IsStream; // Stream bit
6064
6065 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6066 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6067 }
6068 case Intrinsic::aarch64_sme_str:
6069 case Intrinsic::aarch64_sme_ldr: {
6070 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6071 }
6072 case Intrinsic::aarch64_sme_za_enable:
6073 return DAG.getNode(
6074 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6075 Op->getOperand(0), // Chain
6076 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6077 case Intrinsic::aarch64_sme_za_disable:
6078 return DAG.getNode(
6079 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6080 Op->getOperand(0), // Chain
6081 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6082 }
6083}
6084
6085SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6086 SelectionDAG &DAG) const {
6087 unsigned IntNo = Op.getConstantOperandVal(1);
6088 SDLoc DL(Op);
6089 switch (IntNo) {
6090 default:
6091 return SDValue(); // Don't custom lower most intrinsics.
6092 case Intrinsic::aarch64_mops_memset_tag: {
6093 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6094 SDValue Chain = Node->getChain();
6095 SDValue Dst = Op.getOperand(2);
6096 SDValue Val = Op.getOperand(3);
6097 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6098 SDValue Size = Op.getOperand(4);
6099 auto Alignment = Node->getMemOperand()->getAlign();
6100 bool IsVol = Node->isVolatile();
6101 auto DstPtrInfo = Node->getPointerInfo();
6102
6103 const auto &SDI =
6104 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6105 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6106 Chain, Dst, Val, Size, Alignment, IsVol,
6107 DstPtrInfo, MachinePointerInfo{});
6108
6109 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6110 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6111 // LowerOperationWrapper will complain that the number of results has
6112 // changed.
6113 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6114 }
6115 }
6116}
6117
6118SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6119 SelectionDAG &DAG) const {
6120 unsigned IntNo = Op.getConstantOperandVal(0);
6121 SDLoc DL(Op);
6122 switch (IntNo) {
6123 default: return SDValue(); // Don't custom lower most intrinsics.
6124 case Intrinsic::thread_pointer: {
6125 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6126 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6127 }
6128 case Intrinsic::aarch64_sve_whilewr_b:
6129 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6130 Op.getOperand(1), Op.getOperand(2),
6131 DAG.getConstant(1, DL, MVT::i64));
6132 case Intrinsic::aarch64_sve_whilewr_h:
6133 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6134 Op.getOperand(1), Op.getOperand(2),
6135 DAG.getConstant(2, DL, MVT::i64));
6136 case Intrinsic::aarch64_sve_whilewr_s:
6137 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6138 Op.getOperand(1), Op.getOperand(2),
6139 DAG.getConstant(4, DL, MVT::i64));
6140 case Intrinsic::aarch64_sve_whilewr_d:
6141 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6142 Op.getOperand(1), Op.getOperand(2),
6143 DAG.getConstant(8, DL, MVT::i64));
6144 case Intrinsic::aarch64_sve_whilerw_b:
6145 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6146 Op.getOperand(1), Op.getOperand(2),
6147 DAG.getConstant(1, DL, MVT::i64));
6148 case Intrinsic::aarch64_sve_whilerw_h:
6149 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6150 Op.getOperand(1), Op.getOperand(2),
6151 DAG.getConstant(2, DL, MVT::i64));
6152 case Intrinsic::aarch64_sve_whilerw_s:
6153 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6154 Op.getOperand(1), Op.getOperand(2),
6155 DAG.getConstant(4, DL, MVT::i64));
6156 case Intrinsic::aarch64_sve_whilerw_d:
6157 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6158 Op.getOperand(1), Op.getOperand(2),
6159 DAG.getConstant(8, DL, MVT::i64));
6160 case Intrinsic::aarch64_neon_abs: {
6161 EVT Ty = Op.getValueType();
6162 if (Ty == MVT::i64) {
6163 SDValue Result =
6164 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6165 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6166 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6167 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6168 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6169 } else {
6170 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6171 }
6172 }
6173 case Intrinsic::aarch64_neon_pmull64: {
6174 SDValue LHS = Op.getOperand(1);
6175 SDValue RHS = Op.getOperand(2);
6176
6177 std::optional<uint64_t> LHSLane =
6178 getConstantLaneNumOfExtractHalfOperand(LHS);
6179 std::optional<uint64_t> RHSLane =
6180 getConstantLaneNumOfExtractHalfOperand(RHS);
6181
6182 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6183 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6184
6185 // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
6186 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6187 // which ISel recognizes better. For example, generate a ldr into d*
6188 // registers as opposed to a GPR load followed by a fmov.
6189 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6190 std::optional<uint64_t> OtherLane,
6191 const SDLoc &DL,
6192 SelectionDAG &DAG) -> SDValue {
6193 // If the operand is a higher half itself, rewrite it to
6194 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6195 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6196 if (NLane == 1)
6197 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6198 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6199
6200 // Operand N is not a higher half but the other operand is.
6201 if (OtherLane == 1) {
6202 // If this operand is a lower half, rewrite it to
6203 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6204 // align lanes of two operands. A roundtrip sequence (to move from lane
6205 // 1 to lane 0) is like this:
6206 // mov x8, v0.d[1]
6207 // fmov d0, x8
6208 if (NLane == 0)
6209 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6210 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6211 N.getOperand(0),
6212 DAG.getConstant(0, DL, MVT::i64)),
6213 DAG.getConstant(1, DL, MVT::i64));
6214
6215 // Otherwise just dup from main to all lanes.
6216 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6217 }
6218
6219 // Neither operand is an extract of the higher half, so codegen may just use
6220 // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
6221 assert(N.getValueType() == MVT::i64 &&
6222 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6223 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6224 };
6225
6226 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6227 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6228
6229 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6230 }
6231 case Intrinsic::aarch64_neon_smax:
6232 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6233 Op.getOperand(2));
6234 case Intrinsic::aarch64_neon_umax:
6235 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6236 Op.getOperand(2));
6237 case Intrinsic::aarch64_neon_smin:
6238 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6239 Op.getOperand(2));
6240 case Intrinsic::aarch64_neon_umin:
6241 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6242 Op.getOperand(2));
6243 case Intrinsic::aarch64_neon_scalar_sqxtn:
6244 case Intrinsic::aarch64_neon_scalar_sqxtun:
6245 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6246 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6247 if (Op.getValueType() == MVT::i32)
6248 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6249 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6250 Op.getOperand(0),
6251 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6252 Op.getOperand(1))));
6253 return SDValue();
6254 }
6255 case Intrinsic::aarch64_neon_sqxtn:
6256 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6257 Op.getOperand(1));
6258 case Intrinsic::aarch64_neon_sqxtun:
6259 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6260 Op.getOperand(1));
6261 case Intrinsic::aarch64_neon_uqxtn:
6262 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6263 Op.getOperand(1));
6264 case Intrinsic::aarch64_neon_sqshrn:
6265 if (Op.getValueType().isVector())
6266 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6267 DAG.getNode(AArch64ISD::VASHR, DL,
6268 Op.getOperand(1).getValueType(),
6269 Op.getOperand(1), Op.getOperand(2)));
6270 return SDValue();
6271 case Intrinsic::aarch64_neon_sqshrun:
6272 if (Op.getValueType().isVector())
6273 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6274 DAG.getNode(AArch64ISD::VASHR, DL,
6275 Op.getOperand(1).getValueType(),
6276 Op.getOperand(1), Op.getOperand(2)));
6277 return SDValue();
6278 case Intrinsic::aarch64_neon_uqshrn:
6279 if (Op.getValueType().isVector())
6280 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6281 DAG.getNode(AArch64ISD::VLSHR, DL,
6282 Op.getOperand(1).getValueType(),
6283 Op.getOperand(1), Op.getOperand(2)));
6284 return SDValue();
6285 case Intrinsic::aarch64_neon_sqrshrn:
6286 if (Op.getValueType().isVector())
6287 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6288 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6289 Op.getOperand(1).getValueType(),
6290 Op.getOperand(1), Op.getOperand(2)));
6291 return SDValue();
6292 case Intrinsic::aarch64_neon_sqrshrun:
6293 if (Op.getValueType().isVector())
6294 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6295 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6296 Op.getOperand(1).getValueType(),
6297 Op.getOperand(1), Op.getOperand(2)));
6298 return SDValue();
6299 case Intrinsic::aarch64_neon_uqrshrn:
6300 if (Op.getValueType().isVector())
6301 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6302 DAG.getNode(AArch64ISD::URSHR_I, DL,
6303 Op.getOperand(1).getValueType(),
6304 Op.getOperand(1), Op.getOperand(2)));
6305 return SDValue();
6306 case Intrinsic::aarch64_neon_sqadd:
6307 if (Op.getValueType().isVector())
6308 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6309 Op.getOperand(2));
6310 return SDValue();
6311 case Intrinsic::aarch64_neon_sqsub:
6312 if (Op.getValueType().isVector())
6313 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6314 Op.getOperand(2));
6315 return SDValue();
6316 case Intrinsic::aarch64_neon_uqadd:
6317 if (Op.getValueType().isVector())
6318 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6319 Op.getOperand(2));
6320 return SDValue();
6321 case Intrinsic::aarch64_neon_uqsub:
6322 if (Op.getValueType().isVector())
6323 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6324 Op.getOperand(2));
6325 return SDValue();
6326 case Intrinsic::aarch64_sve_whilelt:
6327 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6328 /*IsEqual=*/false);
6329 case Intrinsic::aarch64_sve_whilels:
6330 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6331 /*IsEqual=*/true);
6332 case Intrinsic::aarch64_sve_whilele:
6333 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6334 /*IsEqual=*/true);
6335 case Intrinsic::aarch64_sve_sunpkhi:
6336 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6337 Op.getOperand(1));
6338 case Intrinsic::aarch64_sve_sunpklo:
6339 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6340 Op.getOperand(1));
6341 case Intrinsic::aarch64_sve_uunpkhi:
6342 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6343 Op.getOperand(1));
6344 case Intrinsic::aarch64_sve_uunpklo:
6345 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6346 Op.getOperand(1));
6347 case Intrinsic::aarch64_sve_clasta_n:
6348 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6349 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6350 case Intrinsic::aarch64_sve_clastb_n:
6351 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6352 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6353 case Intrinsic::aarch64_sve_lasta:
6354 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6355 Op.getOperand(1), Op.getOperand(2));
6356 case Intrinsic::aarch64_sve_lastb:
6357 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6358 Op.getOperand(1), Op.getOperand(2));
6359 case Intrinsic::aarch64_sve_rev:
6360 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6361 Op.getOperand(1));
6362 case Intrinsic::aarch64_sve_tbl:
6363 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6364 Op.getOperand(2));
6365 case Intrinsic::aarch64_sve_trn1:
6366 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6367 Op.getOperand(1), Op.getOperand(2));
6368 case Intrinsic::aarch64_sve_trn2:
6369 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6370 Op.getOperand(1), Op.getOperand(2));
6371 case Intrinsic::aarch64_sve_uzp1:
6372 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6373 Op.getOperand(1), Op.getOperand(2));
6374 case Intrinsic::aarch64_sve_uzp2:
6375 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6376 Op.getOperand(1), Op.getOperand(2));
6377 case Intrinsic::aarch64_sve_zip1:
6378 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6379 Op.getOperand(1), Op.getOperand(2));
6380 case Intrinsic::aarch64_sve_zip2:
6381 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6382 Op.getOperand(1), Op.getOperand(2));
6383 case Intrinsic::aarch64_sve_splice:
6384 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6385 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6386 case Intrinsic::aarch64_sve_ptrue:
6387 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6388 case Intrinsic::aarch64_sve_clz:
6389 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6390 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6391 case Intrinsic::aarch64_sme_cntsb:
6392 return DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6393 DAG.getConstant(1, DL, MVT::i32));
6394 case Intrinsic::aarch64_sme_cntsh: {
6395 SDValue One = DAG.getConstant(1, DL, MVT::i32);
6396 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), One);
6397 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, One);
6398 }
6399 case Intrinsic::aarch64_sme_cntsw: {
6400 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6401 DAG.getConstant(1, DL, MVT::i32));
6402 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6403 DAG.getConstant(2, DL, MVT::i32));
6404 }
6405 case Intrinsic::aarch64_sme_cntsd: {
6406 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6407 DAG.getConstant(1, DL, MVT::i32));
6408 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6409 DAG.getConstant(3, DL, MVT::i32));
6410 }
6411 case Intrinsic::aarch64_sve_cnt: {
6412 SDValue Data = Op.getOperand(3);
6413 // CTPOP only supports integer operands.
6414 if (Data.getValueType().isFloatingPoint())
6415 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6416 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6417 Op.getOperand(2), Data, Op.getOperand(1));
6418 }
6419 case Intrinsic::aarch64_sve_dupq_lane:
6420 return LowerDUPQLane(Op, DAG);
6421 case Intrinsic::aarch64_sve_convert_from_svbool:
6422 if (Op.getValueType() == MVT::aarch64svcount)
6423 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6424 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6425 case Intrinsic::aarch64_sve_convert_to_svbool:
6426 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6427 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6428 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6429 case Intrinsic::aarch64_sve_fneg:
6430 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6431 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6432 case Intrinsic::aarch64_sve_frintp:
6433 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6434 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6435 case Intrinsic::aarch64_sve_frintm:
6436 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6437 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6438 case Intrinsic::aarch64_sve_frinti:
6439 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6440 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6441 Op.getOperand(1));
6442 case Intrinsic::aarch64_sve_frintx:
6443 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6444 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6445 case Intrinsic::aarch64_sve_frinta:
6446 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6447 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6448 case Intrinsic::aarch64_sve_frintn:
6449 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6450 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6451 Op.getOperand(1));
6452 case Intrinsic::aarch64_sve_frintz:
6453 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6454 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6455 case Intrinsic::aarch64_sve_ucvtf:
6456 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6457 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6458 Op.getOperand(1));
6459 case Intrinsic::aarch64_sve_scvtf:
6460 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6461 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6462 Op.getOperand(1));
6463 case Intrinsic::aarch64_sve_fcvtzu:
6464 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6465 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6466 case Intrinsic::aarch64_sve_fcvtzs:
6467 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6468 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6469 case Intrinsic::aarch64_sve_fsqrt:
6470 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6471 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6472 case Intrinsic::aarch64_sve_frecpx:
6473 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6474 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6475 case Intrinsic::aarch64_sve_frecpe_x:
6476 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6477 Op.getOperand(1));
6478 case Intrinsic::aarch64_sve_frecps_x:
6479 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6480 Op.getOperand(1), Op.getOperand(2));
6481 case Intrinsic::aarch64_sve_frsqrte_x:
6482 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6483 Op.getOperand(1));
6484 case Intrinsic::aarch64_sve_frsqrts_x:
6485 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6486 Op.getOperand(1), Op.getOperand(2));
6487 case Intrinsic::aarch64_sve_fabs:
6488 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6489 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6490 case Intrinsic::aarch64_sve_abs:
6491 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6492 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6493 case Intrinsic::aarch64_sve_neg:
6494 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6495 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6496 case Intrinsic::aarch64_sve_insr: {
6497 SDValue Scalar = Op.getOperand(2);
6498 EVT ScalarTy = Scalar.getValueType();
6499 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6500 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6501
6502 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6503 Op.getOperand(1), Scalar);
6504 }
6505 case Intrinsic::aarch64_sve_rbit:
6506 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6507 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6508 Op.getOperand(1));
6509 case Intrinsic::aarch64_sve_revb:
6510 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6511 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6512 case Intrinsic::aarch64_sve_revh:
6513 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6514 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6515 case Intrinsic::aarch64_sve_revw:
6516 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6517 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6518 case Intrinsic::aarch64_sve_revd:
6519 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6520 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6521 case Intrinsic::aarch64_sve_sxtb:
6522 return DAG.getNode(
6523 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6524 Op.getOperand(2), Op.getOperand(3),
6525 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6526 Op.getOperand(1));
6527 case Intrinsic::aarch64_sve_sxth:
6528 return DAG.getNode(
6529 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6530 Op.getOperand(2), Op.getOperand(3),
6531 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6532 Op.getOperand(1));
6533 case Intrinsic::aarch64_sve_sxtw:
6534 return DAG.getNode(
6535 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6536 Op.getOperand(2), Op.getOperand(3),
6537 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6538 Op.getOperand(1));
6539 case Intrinsic::aarch64_sve_uxtb:
6540 return DAG.getNode(
6541 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6542 Op.getOperand(2), Op.getOperand(3),
6543 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6544 Op.getOperand(1));
6545 case Intrinsic::aarch64_sve_uxth:
6546 return DAG.getNode(
6547 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6548 Op.getOperand(2), Op.getOperand(3),
6549 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6550 Op.getOperand(1));
6551 case Intrinsic::aarch64_sve_uxtw:
6552 return DAG.getNode(
6553 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6554 Op.getOperand(2), Op.getOperand(3),
6555 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6556 Op.getOperand(1));
6557 case Intrinsic::localaddress: {
6558 const auto &MF = DAG.getMachineFunction();
6559 const auto *RegInfo = Subtarget->getRegisterInfo();
6560 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6561 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6562 Op.getSimpleValueType());
6563 }
6564
6565 case Intrinsic::eh_recoverfp: {
6566 // FIXME: This needs to be implemented to correctly handle highly aligned
6567 // stack objects. For now we simply return the incoming FP. Refer D53541
6568 // for more details.
6569 SDValue FnOp = Op.getOperand(1);
6570 SDValue IncomingFPOp = Op.getOperand(2);
6571 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6572 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6573 if (!Fn)
6575 "llvm.eh.recoverfp must take a function as the first argument");
6576 return IncomingFPOp;
6577 }
6578
6579 case Intrinsic::aarch64_neon_vsri:
6580 case Intrinsic::aarch64_neon_vsli:
6581 case Intrinsic::aarch64_sve_sri:
6582 case Intrinsic::aarch64_sve_sli: {
6583 EVT Ty = Op.getValueType();
6584
6585 if (!Ty.isVector())
6586 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6587
6588 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6589
6590 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6591 IntNo == Intrinsic::aarch64_sve_sri;
6592 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6593 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6594 Op.getOperand(3));
6595 }
6596
6597 case Intrinsic::aarch64_neon_srhadd:
6598 case Intrinsic::aarch64_neon_urhadd:
6599 case Intrinsic::aarch64_neon_shadd:
6600 case Intrinsic::aarch64_neon_uhadd: {
6601 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6602 IntNo == Intrinsic::aarch64_neon_shadd);
6603 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6604 IntNo == Intrinsic::aarch64_neon_urhadd);
6605 unsigned Opcode = IsSignedAdd
6606 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6607 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6608 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6609 Op.getOperand(2));
6610 }
6611 case Intrinsic::aarch64_neon_saddlp:
6612 case Intrinsic::aarch64_neon_uaddlp: {
6613 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6614 ? AArch64ISD::UADDLP
6615 : AArch64ISD::SADDLP;
6616 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6617 }
6618 case Intrinsic::aarch64_neon_sdot:
6619 case Intrinsic::aarch64_neon_udot:
6620 case Intrinsic::aarch64_sve_sdot:
6621 case Intrinsic::aarch64_sve_udot: {
6622 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6623 IntNo == Intrinsic::aarch64_sve_udot)
6624 ? AArch64ISD::UDOT
6625 : AArch64ISD::SDOT;
6626 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6627 Op.getOperand(2), Op.getOperand(3));
6628 }
6629 case Intrinsic::aarch64_neon_usdot:
6630 case Intrinsic::aarch64_sve_usdot: {
6631 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6632 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6633 }
6634 case Intrinsic::aarch64_neon_saddlv:
6635 case Intrinsic::aarch64_neon_uaddlv: {
6636 EVT OpVT = Op.getOperand(1).getValueType();
6637 EVT ResVT = Op.getValueType();
6638 assert(
6639 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6640 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6641 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6642 "Unexpected aarch64_neon_u/saddlv type");
6643 (void)OpVT;
6644 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6645 SDValue ADDLV = DAG.getNode(
6646 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6647 : AArch64ISD::SADDLV,
6648 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6649 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6650 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6651 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6652 return EXTRACT_VEC_ELT;
6653 }
6654 case Intrinsic::experimental_cttz_elts: {
6655 SDValue CttzOp = Op.getOperand(1);
6656 EVT VT = CttzOp.getValueType();
6657 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6658
6659 if (VT.isFixedLengthVector()) {
6660 // We can use SVE instructions to lower this intrinsic by first creating
6661 // an SVE predicate register mask from the fixed-width vector.
6662 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6663 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6664 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6665 }
6666
6667 SDValue NewCttzElts =
6668 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6669 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6670 }
6671 case Intrinsic::experimental_vector_match: {
6672 return LowerVectorMatch(Op, DAG);
6673 }
6674 }
6675}
6676
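// Gather/scatter index vectors with i8 or i16 elements cannot be used
// directly, so request that they be extended to i32 first.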
6677bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6678 if (VT.getVectorElementType() == MVT::i8 ||
6679 VT.getVectorElementType() == MVT::i16) {
6680 EltTy = MVT::i32;
6681 return true;
6682 }
6683 return false;
6684}
6685
6686bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6687 EVT DataVT) const {
6688 const EVT IndexVT = Extend.getOperand(0).getValueType();
6689 // SVE only supports implicit extension of 32-bit indices.
6690 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6691 return false;
6692
6693 // Indices cannot be smaller than the main data type.
6694 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6695 return false;
6696
6697 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6698 // element container type, which would violate the previous clause.
6699 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6700}
6701
6702bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6703 EVT ExtVT = ExtVal.getValueType();
6704 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6705 return false;
6706
6707 // It may be worth creating extending masked loads if there are multiple
6708 // masked loads using the same predicate. That way we'll end up creating
6709 // extending masked loads that may then get split by the legaliser. This
6710 // results in just one set of predicate unpacks at the start, instead of
6711 // multiple sets of vector unpacks after each load.
6712 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6713 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6714 // Disable extending masked loads for fixed-width for now, since the code
6715 // quality doesn't look great.
6716 if (!ExtVT.isScalableVector())
6717 return false;
6718
6719 unsigned NumExtMaskedLoads = 0;
6720 for (auto *U : Ld->getMask()->users())
6721 if (isa<MaskedLoadSDNode>(U))
6722 NumExtMaskedLoads++;
6723
6724 if (NumExtMaskedLoads <= 1)
6725 return false;
6726 }
6727 }
6728
6729 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6730 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6731 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6732}
6733
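// Map a gather's addressing mode (scaled? signed? needs extension?) onto the
// corresponding GLD1 node. The signedness only matters when a 32-bit index has
// to be extended, where it selects between the SXTW and UXTW forms.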
6734unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6735 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6736 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6737 AArch64ISD::GLD1_MERGE_ZERO},
6738 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6739 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6740 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6741 AArch64ISD::GLD1_MERGE_ZERO},
6742 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6743 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6744 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6745 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6746 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6747 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6748 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6749 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6750 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6751 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6752 };
6753 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6754 return AddrModes.find(Key)->second;
6755}
6756
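// Return the sign-extending (GLD1S*) variant of the given GLD1 gather opcode.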
6757unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6758 switch (Opcode) {
6759 default:
6760 llvm_unreachable("unimplemented opcode");
6761 return Opcode;
6762 case AArch64ISD::GLD1_MERGE_ZERO:
6763 return AArch64ISD::GLD1S_MERGE_ZERO;
6764 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6765 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6766 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6767 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6768 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6769 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6770 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6771 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6772 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6773 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6774 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6775 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6776 }
6777}
6778
6779SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6780 SelectionDAG &DAG) const {
6781 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6782
6783 SDLoc DL(Op);
6784 SDValue Chain = MGT->getChain();
6785 SDValue PassThru = MGT->getPassThru();
6786 SDValue Mask = MGT->getMask();
6787 SDValue BasePtr = MGT->getBasePtr();
6788 SDValue Index = MGT->getIndex();
6789 SDValue Scale = MGT->getScale();
6790 EVT VT = Op.getValueType();
6791 EVT MemVT = MGT->getMemoryVT();
6792 ISD::LoadExtType ExtType = MGT->getExtensionType();
6793 ISD::MemIndexType IndexType = MGT->getIndexType();
6794
6795 // SVE supports zero (and so undef) passthrough values only; everything else
6796 // must be handled manually by an explicit select on the load's output.
6797 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6798 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6799 SDValue Load =
6800 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6801 MGT->getMemOperand(), IndexType, ExtType);
6802 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6803 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6804 }
6805
6806 bool IsScaled = MGT->isIndexScaled();
6807 bool IsSigned = MGT->isIndexSigned();
6808
6809 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6810 // must be calculated beforehand.
6811 uint64_t ScaleVal = Scale->getAsZExtVal();
6812 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6813 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6814 EVT IndexVT = Index.getValueType();
6815 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6816 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6817 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6818
6819 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6820 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6821 MGT->getMemOperand(), IndexType, ExtType);
6822 }
6823
6824 // Lower fixed length gather to a scalable equivalent.
6825 if (VT.isFixedLengthVector()) {
6826 assert(Subtarget->useSVEForFixedLengthVectors() &&
6827 "Cannot lower when not using SVE for fixed vectors!");
6828
6829 // NOTE: Handle floating-point as if integer then bitcast the result.
6830 EVT DataVT = VT.changeVectorElementTypeToInteger();
6831 MemVT = MemVT.changeVectorElementTypeToInteger();
6832
6833 // Find the smallest integer fixed length vector we can use for the gather.
6834 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6835 if (DataVT.getVectorElementType() == MVT::i64 ||
6836 Index.getValueType().getVectorElementType() == MVT::i64 ||
6837 Mask.getValueType().getVectorElementType() == MVT::i64)
6838 PromotedVT = VT.changeVectorElementType(MVT::i64);
6839
6840 // Promote vector operands except for passthrough, which we know is either
6841 // undef or zero, and thus best constructed directly.
6842 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6843 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6844 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6845
6846 // A promoted result type forces the need for an extending load.
6847 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6848 ExtType = ISD::EXTLOAD;
6849
6850 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6851
6852 // Convert fixed length vector operands to scalable.
6853 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6854 Index = convertToScalableVector(DAG, ContainerVT, Index);
6855 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6856 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6857 : DAG.getConstant(0, DL, ContainerVT);
6858
6859 // Emit equivalent scalable vector gather.
6860 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6861 SDValue Load =
6862 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6863 Ops, MGT->getMemOperand(), IndexType, ExtType);
6864
6865 // Extract fixed length data then convert to the required result type.
6866 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6867 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6868 if (VT.isFloatingPoint())
6869 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6870
6871 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6872 }
6873
6874 // Everything else is legal.
6875 return Op;
6876}
6877
6878SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6879 SelectionDAG &DAG) const {
6880 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6881
6882 SDLoc DL(Op);
6883 SDValue Chain = MSC->getChain();
6884 SDValue StoreVal = MSC->getValue();
6885 SDValue Mask = MSC->getMask();
6886 SDValue BasePtr = MSC->getBasePtr();
6887 SDValue Index = MSC->getIndex();
6888 SDValue Scale = MSC->getScale();
6889 EVT VT = StoreVal.getValueType();
6890 EVT MemVT = MSC->getMemoryVT();
6891 ISD::MemIndexType IndexType = MSC->getIndexType();
6892 bool Truncating = MSC->isTruncatingStore();
6893
6894 bool IsScaled = MSC->isIndexScaled();
6895 bool IsSigned = MSC->isIndexSigned();
6896
6897 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6898 // must be calculated beforehand.
6899 uint64_t ScaleVal = Scale->getAsZExtVal();
6900 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6901 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6902 EVT IndexVT = Index.getValueType();
6903 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6904 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6905 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6906
6907 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6908 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6909 MSC->getMemOperand(), IndexType, Truncating);
6910 }
6911
6912 // Lower fixed length scatter to a scalable equivalent.
6913 if (VT.isFixedLengthVector()) {
6914 assert(Subtarget->useSVEForFixedLengthVectors() &&
6915 "Cannot lower when not using SVE for fixed vectors!");
6916
6917 // Once bitcast we treat floating-point scatters as if integer.
6918 if (VT.isFloatingPoint()) {
6919 VT = VT.changeVectorElementTypeToInteger();
6920 MemVT = MemVT.changeVectorElementTypeToInteger();
6921 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6922 }
6923
6924 // Find the smallest integer fixed length vector we can use for the scatter.
6925 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6926 if (VT.getVectorElementType() == MVT::i64 ||
6927 Index.getValueType().getVectorElementType() == MVT::i64 ||
6928 Mask.getValueType().getVectorElementType() == MVT::i64)
6929 PromotedVT = VT.changeVectorElementType(MVT::i64);
6930
6931 // Promote vector operands.
6932 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6933 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6934 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6935 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6936
6937 // A promoted value type forces the need for a truncating store.
6938 if (PromotedVT != VT)
6939 Truncating = true;
6940
6941 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6942
6943 // Convert fixed length vector operands to scalable.
6944 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6945 Index = convertToScalableVector(DAG, ContainerVT, Index);
6946 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6947 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6948
6949 // Emit equivalent scalable vector scatter.
6950 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6951 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6952 MSC->getMemOperand(), IndexType, Truncating);
6953 }
6954
6955 // Everything else is legal.
6956 return Op;
6957}
6958
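// Custom lowering for masked loads: fixed-length vectors are lowered to their
// SVE equivalents, and a non-zero passthru is handled by loading with an undef
// passthru and selecting the result afterwards, since SVE masked loads only
// support zero/undef passthru values.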
6959SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6960 SDLoc DL(Op);
6961 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6962 assert(LoadNode && "Expected custom lowering of a masked load node");
6963 EVT VT = Op->getValueType(0);
6964
6965 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6966 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6967
6968 SDValue PassThru = LoadNode->getPassThru();
6969 SDValue Mask = LoadNode->getMask();
6970
6971 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6972 return Op;
6973
6974 SDValue Load = DAG.getMaskedLoad(
6975 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6976 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6977 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6978 LoadNode->getExtensionType());
6979
6980 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6981
6982 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6983}
6984
6985// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6986 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6987 EVT VT, EVT MemVT,
6988 SelectionDAG &DAG) {
6989 assert(VT.isVector() && "VT should be a vector type");
6990 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6991
6992 SDValue Value = ST->getValue();
6993
6994 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
6995 // the word lane which represents the v4i8 subvector. It optimizes the store
6996 // to:
6997 //
6998 // xtn v0.8b, v0.8h
6999 // str s0, [x0]
7000
7001 SDValue Undef = DAG.getUNDEF(MVT::i16);
7002 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7003 {Undef, Undef, Undef, Undef});
7004
7005 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7006 Value, UndefVec);
7007 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7008
7009 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7010 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7011 Trunc, DAG.getConstant(0, DL, MVT::i64));
7012
7013 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7014 ST->getBasePtr(), ST->getMemOperand());
7015}
7016
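// Lower addrspacecast to or from the 32-bit PTR32 address spaces: casts from
// PTR32_SPTR/PTR32_UPTR sign- or zero-extend the pointer value, casts to them
// are lowered as an any-extend/truncate followed by a zero-extend-in-reg, and
// all other casts pass the value through unchanged.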
7017 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7018 SDLoc DL(Op);
7019 SDValue Src = Op.getOperand(0);
7020 MVT DestVT = Op.getSimpleValueType();
7021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7022 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
7023
7024 unsigned SrcAS = N->getSrcAddressSpace();
7025 unsigned DestAS = N->getDestAddressSpace();
7026 assert(SrcAS != DestAS &&
7027 "addrspacecast must be between different address spaces");
7028 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7029 TLI.getTargetMachine().getPointerSize(DestAS) &&
7030 "addrspacecast must be between different ptr sizes");
7031 (void)TLI;
7032
7033 if (SrcAS == ARM64AS::PTR32_SPTR) {
7034 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7035 DAG.getTargetConstant(0, DL, DestVT));
7036 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7037 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7038 DAG.getTargetConstant(0, DL, DestVT));
7039 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7040 (DestAS == ARM64AS::PTR32_UPTR)) {
7041 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7042 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7043 return Trunc;
7044 } else {
7045 return Src;
7046 }
7047}
7048
7049 // Custom lowering for any store, vector or scalar, normal or truncating.
7050 // Currently we only custom lower truncating stores from v4i16 to v4i8 and
7051 // volatile stores of i128.
7052SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7053 SelectionDAG &DAG) const {
7054 SDLoc Dl(Op);
7055 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7056 assert (StoreNode && "Can only custom lower store nodes");
7057
7058 SDValue Value = StoreNode->getValue();
7059
7060 EVT VT = Value.getValueType();
7061 EVT MemVT = StoreNode->getMemoryVT();
7062
7063 if (VT.isVector()) {
7064 if (useSVEForFixedLengthVectorVT(
7065 VT,
7066 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7067 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7068
7069 unsigned AS = StoreNode->getAddressSpace();
7070 Align Alignment = StoreNode->getAlign();
7071 if (Alignment < MemVT.getStoreSize() &&
7072 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7073 StoreNode->getMemOperand()->getFlags(),
7074 nullptr)) {
7075 return scalarizeVectorStore(StoreNode, DAG);
7076 }
7077
7078 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7079 MemVT == MVT::v4i8) {
7080 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7081 }
7082 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7083 // the custom lowering, as there are no un-paired non-temporal stores and
7084 // legalization will break up 256 bit inputs.
7085 ElementCount EC = MemVT.getVectorElementCount();
7086 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7087 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7088 (MemVT.getScalarSizeInBits() == 8u ||
7089 MemVT.getScalarSizeInBits() == 16u ||
7090 MemVT.getScalarSizeInBits() == 32u ||
7091 MemVT.getScalarSizeInBits() == 64u)) {
7092 SDValue Lo =
7093 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7094 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7095 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7096 SDValue Hi =
7097 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7098 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7099 StoreNode->getValue(),
7100 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7101 SDValue Result = DAG.getMemIntrinsicNode(
7102 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7103 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7104 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7105 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7106 return Result;
7107 }
7108 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7109 return LowerStore128(Op, DAG);
7110 } else if (MemVT == MVT::i64x8) {
7111 SDValue Value = StoreNode->getValue();
7112 assert(Value->getValueType(0) == MVT::i64x8);
7113 SDValue Chain = StoreNode->getChain();
7114 SDValue Base = StoreNode->getBasePtr();
7115 EVT PtrVT = Base.getValueType();
7116 for (unsigned i = 0; i < 8; i++) {
7117 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7118 Value, DAG.getConstant(i, Dl, MVT::i32));
7119 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7120 DAG.getConstant(i * 8, Dl, PtrVT));
7121 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7122 StoreNode->getBaseAlign());
7123 }
7124 return Chain;
7125 }
7126
7127 return SDValue();
7128}
7129
7130/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7131SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7132 SelectionDAG &DAG) const {
7133 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7134 assert(StoreNode->getMemoryVT() == MVT::i128);
7135 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7136
7137 bool IsStoreRelease =
7138 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7139 if (StoreNode->isAtomic())
7140 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7141 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7142 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7143 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7144
7145 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7146 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7147 ? StoreNode->getOperand(1)
7148 : StoreNode->getOperand(2);
7149 SDLoc DL(Op);
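// The 128-bit value is split into two 64-bit halves and emitted as a single
// store pair: STILP for release stores (FEAT_LRCPC3), otherwise STP, which
// FEAT_LSE2 guarantees to be single-copy atomic for aligned 16-byte accesses.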
7150 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7151 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7152 if (DAG.getDataLayout().isBigEndian())
7153 std::swap(StoreValue.first, StoreValue.second);
7155 Opcode, DL, DAG.getVTList(MVT::Other),
7156 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7157 StoreNode->getBasePtr()},
7158 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7159 return Result;
7160}
7161
7162SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7163 SelectionDAG &DAG) const {
7164 SDLoc DL(Op);
7165 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7166 assert(LoadNode && "Expected custom lowering of a load node");
7167
7168 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7169 SmallVector<SDValue, 8> Ops;
7170 SDValue Base = LoadNode->getBasePtr();
7171 SDValue Chain = LoadNode->getChain();
7172 EVT PtrVT = Base.getValueType();
7173 for (unsigned i = 0; i < 8; i++) {
7174 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7175 DAG.getConstant(i * 8, DL, PtrVT));
7176 SDValue Part =
7177 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7178 LoadNode->getBaseAlign());
7179 Ops.push_back(Part);
7180 Chain = SDValue(Part.getNode(), 1);
7181 }
7182 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7183 return DAG.getMergeValues({Loaded, Chain}, DL);
7184 }
7185
7186 // Custom lowering for extending v4i8 vector loads.
7187 EVT VT = Op->getValueType(0);
7188 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7189
7190 if (LoadNode->getMemoryVT() != MVT::v4i8)
7191 return SDValue();
7192
7193 // Avoid generating unaligned loads.
7194 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7195 return SDValue();
7196
7197 unsigned ExtType;
7198 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7199 ExtType = ISD::SIGN_EXTEND;
7200 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7201 LoadNode->getExtensionType() == ISD::EXTLOAD)
7202 ExtType = ISD::ZERO_EXTEND;
7203 else
7204 return SDValue();
7205
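// The expansion below loads the four bytes as a single f32, moves the value
// into a vector register, reinterprets it as v8i8, extends, and keeps only
// the low four lanes (extending once more when a v4i32 result is required).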
7206 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7207 LoadNode->getBasePtr(), MachinePointerInfo());
7208 SDValue Chain = Load.getValue(1);
7209 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7210 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7211 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7212 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7213 DAG.getConstant(0, DL, MVT::i64));
7214 if (VT == MVT::v4i32)
7215 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7216 return DAG.getMergeValues({Ext, Chain}, DL);
7217}
7218
7219SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7220 SelectionDAG &DAG) const {
7221 SDLoc DL(Op);
7222 SDValue Vec = Op.getOperand(0);
7223 SDValue Mask = Op.getOperand(1);
7224 SDValue Passthru = Op.getOperand(2);
7225 EVT VecVT = Vec.getValueType();
7226 EVT MaskVT = Mask.getValueType();
7227 EVT ElmtVT = VecVT.getVectorElementType();
7228 const bool IsFixedLength = VecVT.isFixedLengthVector();
7229 const bool HasPassthru = !Passthru.isUndef();
7230 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7231 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7232
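// Overview of the lowering below: widen a fixed-length input into a scalable
// container if necessary, run the SVE COMPACT intrinsic under the mask, blend
// the passthru into the tail lanes via CNTP/WHILELO + VSELECT, and finally
// narrow the result back to the original type.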
7233 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7234
7235 if (!Subtarget->isSVEAvailable())
7236 return SDValue();
7237
7238 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7239 return SDValue();
7240
7241 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7242 if (MinElmts != 2 && MinElmts != 4)
7243 return SDValue();
7244
7245 // We can use the SVE register containing the NEON vector in its lowest bits.
7246 if (IsFixedLength) {
7247 EVT ScalableVecVT =
7248 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7249 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7250 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7251
7252 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7253 DAG.getUNDEF(ScalableVecVT), Vec,
7254 DAG.getConstant(0, DL, MVT::i64));
7255 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7256 DAG.getUNDEF(ScalableMaskVT), Mask,
7257 DAG.getConstant(0, DL, MVT::i64));
7258 Mask = DAG.getNode(ISD::TRUNCATE, DL,
7259 ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7260 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7261 DAG.getUNDEF(ScalableVecVT), Passthru,
7262 DAG.getConstant(0, DL, MVT::i64));
7263
7264 VecVT = Vec.getValueType();
7265 MaskVT = Mask.getValueType();
7266 }
7267
7268 // Get legal type for compact instruction
7269 EVT ContainerVT = getSVEContainerType(VecVT);
7270 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7271
7272 // Convert to i32 or i64 for smaller types, as these are the only supported
7273 // sizes for compact.
7274 if (ContainerVT != VecVT) {
7275 Vec = DAG.getBitcast(CastVT, Vec);
7276 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7277 }
7278
7279 SDValue Compressed = DAG.getNode(
7280 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
7281 DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7282
7283 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7284 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7285 SDValue Offset = DAG.getNode(
7286 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7287 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7288
7289 SDValue IndexMask = DAG.getNode(
7290 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7291 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7292 DAG.getConstant(0, DL, MVT::i64), Offset);
7293
7294 Compressed =
7295 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7296 }
7297
7298 // Extracting from a legal SVE type before truncating produces better code.
7299 if (IsFixedLength) {
7300 Compressed = DAG.getNode(
7301 ISD::EXTRACT_SUBVECTOR, DL,
7302 FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7303 Compressed, DAG.getConstant(0, DL, MVT::i64));
7304 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7305 VecVT = FixedVecVT;
7306 }
7307
7308 // If we changed the element type before, we need to convert it back.
7309 if (ContainerVT != VecVT) {
7310 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7311 Compressed = DAG.getBitcast(VecVT, Compressed);
7312 }
7313
7314 return Compressed;
7315}
7316
7317// Generate SUBS and CSEL for integer abs.
7318SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7319 MVT VT = Op.getSimpleValueType();
7320
7321 if (VT.isVector())
7322 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7323
7324 SDLoc DL(Op);
7325 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7326 Op.getOperand(0));
7327 // Generate SUBS & CSEL.
7328 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7329 Op.getOperand(0), DAG.getConstant(0, DL, VT));
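// The CSEL keeps the original value when the SUBS flags report plus-or-zero
// (PL) and selects the negation otherwise, which yields abs(x).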
7330 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7331 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7332}
7333
7335 SDValue Chain = Op.getOperand(0);
7336 SDValue Cond = Op.getOperand(1);
7337 SDValue Dest = Op.getOperand(2);
7338
7339 AArch64CC::CondCode CC;
7340 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7341 SDLoc DL(Op);
7342 SDValue CCVal = getCondCode(DAG, CC);
7343 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7344 Cmp);
7345 }
7346
7347 return SDValue();
7348}
7349
7350 // Treat FSHR with a constant shift amount as a legal operation; otherwise it
7351 // is expanded. FSHL is converted to FSHR before deciding what to do with it.
7353 SDValue Shifts = Op.getOperand(2);
7354 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7355 // If opcode is FSHL, convert it to FSHR
7356 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7357 SDLoc DL(Op);
7358 MVT VT = Op.getSimpleValueType();
7359 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7360
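// For a constant amount n already reduced modulo the bit width,
// fshl(a, b, n) == fshr(a, b, bitwidth - n), so FSHL is rewritten below as an
// FSHR with the complementary shift; an amount of zero simply returns the
// corresponding operand unchanged.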
7361 if (Op.getOpcode() == ISD::FSHL) {
7362 if (NewShiftNo == 0)
7363 return Op.getOperand(0);
7364
7365 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7366 return DAG.getNode(
7367 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7368 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7369 }
7370
7371 if (Op.getOpcode() == ISD::FSHR) {
7372 if (NewShiftNo == 0)
7373 return Op.getOperand(1);
7374
7375 if (ShiftNo->getZExtValue() == NewShiftNo)
7376 return Op;
7377
7378 // Rewrite using the normalised shift amount.
7379 return DAG.getNode(
7380 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7381 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7382 }
7383 }
7384
7385 return SDValue();
7386}
7387
7389 SDValue X = Op.getOperand(0);
7390 EVT XScalarTy = X.getValueType();
7391 SDValue Exp = Op.getOperand(1);
7392
7393 SDLoc DL(Op);
7394 EVT XVT, ExpVT;
7395 switch (Op.getSimpleValueType().SimpleTy) {
7396 default:
7397 return SDValue();
7398 case MVT::bf16:
7399 case MVT::f16:
7400 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7401 [[fallthrough]];
7402 case MVT::f32:
7403 XVT = MVT::nxv4f32;
7404 ExpVT = MVT::nxv4i32;
7405 break;
7406 case MVT::f64:
7407 XVT = MVT::nxv2f64;
7408 ExpVT = MVT::nxv2i64;
7409 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7410 break;
7411 }
7412
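// The scalar ldexp is implemented with the SVE FSCALE instruction
// (X * 2^Exp): both operands are inserted into lane 0 of an SVE vector,
// FSCALE is applied under an all-true predicate, and lane 0 is extracted
// again (with a final round back to f16/bf16 for the cases extended above).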
7413 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7414 SDValue VX =
7415 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7416 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7417 DAG.getUNDEF(ExpVT), Exp, Zero);
7418 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7419 AArch64SVEPredPattern::all);
7420 SDValue FScale =
7421 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7422 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7423 VPg, VX, VExp);
7424 SDValue Final =
7425 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7426 if (X.getValueType() != XScalarTy)
7427 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7428 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7429 return Final;
7430}
7431
7432SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7433 SelectionDAG &DAG) const {
7434 return Op.getOperand(0);
7435}
7436
7437SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7438 SelectionDAG &DAG) const {
7439 SDValue Chain = Op.getOperand(0);
7440 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7441 SDValue FPtr = Op.getOperand(2); // nested function
7442 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7443
7444 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7445
7446 // ldr NestReg, .+16
7447 // ldr x17, .+20
7448 // br x17
7449 // .word 0
7450 // .nest: .qword nest
7451 // .fptr: .qword fptr
7452 SDValue OutChains[5];
7453
7454 const Function *Func =
7455 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7456 CallingConv::ID CC = Func->getCallingConv();
7457 unsigned NestReg;
7458
7459 switch (CC) {
7460 default:
7461 NestReg = 0x0f; // X15
7462 break;
7464 // Must be kept in sync with AArch64CallingConv.td
7465 NestReg = 0x04; // X4
7466 break;
7467 }
7468
7469 const char FptrReg = 0x11; // X17
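// The 32-bit words stored below are pre-encoded A64 instructions:
// 0x58000080 | NestReg is LDR <NestReg>, .+16 (loads the nest value),
// 0x580000b0 | FptrReg is LDR X17, .+20 (loads the function pointer), and
// 0xd61f0220 is BR X17.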
7470
7471 SDValue Addr = Trmp;
7472
7473 SDLoc DL(Op);
7474 OutChains[0] = DAG.getStore(
7475 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7476 MachinePointerInfo(TrmpAddr));
7477
7478 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7479 DAG.getConstant(4, DL, MVT::i64));
7480 OutChains[1] = DAG.getStore(
7481 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7482 MachinePointerInfo(TrmpAddr, 4));
7483
7484 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7485 DAG.getConstant(8, DL, MVT::i64));
7486 OutChains[2] =
7487 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7488 MachinePointerInfo(TrmpAddr, 8));
7489
7490 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7491 DAG.getConstant(16, DL, MVT::i64));
7492 OutChains[3] =
7493 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7494
7495 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7496 DAG.getConstant(24, DL, MVT::i64));
7497 OutChains[4] =
7498 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7499
7500 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7501
7502 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7503 DAG.getConstant(12, DL, MVT::i64));
7504
7505 // Call clear cache on the trampoline instructions.
7506 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7507 EndOfTrmp);
7508}
7509
7510 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7511 SelectionDAG &DAG) const {
7512 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7513 LLVM_DEBUG(Op.dump());
7514
7515 switch (Op.getOpcode()) {
7516 default:
7517 llvm_unreachable("unimplemented operand");
7518 return SDValue();
7521 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7522 case ISD::BITCAST:
7523 return LowerBITCAST(Op, DAG);
7524 case ISD::GlobalAddress:
7525 return LowerGlobalAddress(Op, DAG);
7527 return LowerGlobalTLSAddress(Op, DAG);
7529 return LowerPtrAuthGlobalAddress(Op, DAG);
7531 return LowerADJUST_TRAMPOLINE(Op, DAG);
7533 return LowerINIT_TRAMPOLINE(Op, DAG);
7534 case ISD::SETCC:
7535 case ISD::STRICT_FSETCC:
7537 return LowerSETCC(Op, DAG);
7538 case ISD::SETCCCARRY:
7539 return LowerSETCCCARRY(Op, DAG);
7540 case ISD::BRCOND:
7541 return LowerBRCOND(Op, DAG);
7542 case ISD::BR_CC:
7543 return LowerBR_CC(Op, DAG);
7544 case ISD::SELECT:
7545 return LowerSELECT(Op, DAG);
7546 case ISD::SELECT_CC:
7547 return LowerSELECT_CC(Op, DAG);
7548 case ISD::JumpTable:
7549 return LowerJumpTable(Op, DAG);
7550 case ISD::BR_JT:
7551 return LowerBR_JT(Op, DAG);
7552 case ISD::BRIND:
7553 return LowerBRIND(Op, DAG);
7554 case ISD::ConstantPool:
7555 return LowerConstantPool(Op, DAG);
7556 case ISD::BlockAddress:
7557 return LowerBlockAddress(Op, DAG);
7558 case ISD::VASTART:
7559 return LowerVASTART(Op, DAG);
7560 case ISD::VACOPY:
7561 return LowerVACOPY(Op, DAG);
7562 case ISD::VAARG:
7563 return LowerVAARG(Op, DAG);
7564 case ISD::UADDO_CARRY:
7565 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7566 case ISD::USUBO_CARRY:
7567 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7568 case ISD::SADDO_CARRY:
7569 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7570 case ISD::SSUBO_CARRY:
7571 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7572 case ISD::SADDO:
7573 case ISD::UADDO:
7574 case ISD::SSUBO:
7575 case ISD::USUBO:
7576 case ISD::SMULO:
7577 case ISD::UMULO:
7578 return LowerXALUO(Op, DAG);
7579 case ISD::FADD:
7580 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7581 case ISD::FSUB:
7582 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7583 case ISD::FMUL:
7584 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7585 case ISD::FMA:
7586 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7587 case ISD::FDIV:
7588 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7589 case ISD::FNEG:
7590 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7591 case ISD::FCEIL:
7592 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7593 case ISD::FFLOOR:
7594 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7595 case ISD::FNEARBYINT:
7596 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7597 case ISD::FRINT:
7598 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7599 case ISD::FROUND:
7600 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7601 case ISD::FROUNDEVEN:
7602 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7603 case ISD::FTRUNC:
7604 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7605 case ISD::FSQRT:
7606 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7607 case ISD::FABS:
7608 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7609 case ISD::FP_ROUND:
7611 return LowerFP_ROUND(Op, DAG);
7612 case ISD::FP_EXTEND:
7614 return LowerFP_EXTEND(Op, DAG);
7615 case ISD::FRAMEADDR:
7616 return LowerFRAMEADDR(Op, DAG);
7617 case ISD::SPONENTRY:
7618 return LowerSPONENTRY(Op, DAG);
7619 case ISD::RETURNADDR:
7620 return LowerRETURNADDR(Op, DAG);
7622 return LowerADDROFRETURNADDR(Op, DAG);
7624 return LowerCONCAT_VECTORS(Op, DAG);
7626 return LowerINSERT_VECTOR_ELT(Op, DAG);
7628 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7629 case ISD::BUILD_VECTOR:
7630 return LowerBUILD_VECTOR(Op, DAG);
7632 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7634 return LowerVECTOR_SHUFFLE(Op, DAG);
7635 case ISD::SPLAT_VECTOR:
7636 return LowerSPLAT_VECTOR(Op, DAG);
7638 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7640 return LowerINSERT_SUBVECTOR(Op, DAG);
7641 case ISD::SDIV:
7642 case ISD::UDIV:
7643 return LowerDIV(Op, DAG);
7644 case ISD::SMIN:
7645 case ISD::UMIN:
7646 case ISD::SMAX:
7647 case ISD::UMAX:
7648 return LowerMinMax(Op, DAG);
7649 case ISD::SRA:
7650 case ISD::SRL:
7651 case ISD::SHL:
7652 return LowerVectorSRA_SRL_SHL(Op, DAG);
7653 case ISD::SHL_PARTS:
7654 case ISD::SRL_PARTS:
7655 case ISD::SRA_PARTS:
7656 return LowerShiftParts(Op, DAG);
7657 case ISD::CTPOP:
7658 case ISD::PARITY:
7659 return LowerCTPOP_PARITY(Op, DAG);
7660 case ISD::FCOPYSIGN:
7661 return LowerFCOPYSIGN(Op, DAG);
7662 case ISD::OR:
7663 return LowerVectorOR(Op, DAG);
7664 case ISD::XOR:
7665 return LowerXOR(Op, DAG);
7666 case ISD::PREFETCH:
7667 return LowerPREFETCH(Op, DAG);
7668 case ISD::SINT_TO_FP:
7669 case ISD::UINT_TO_FP:
7672 return LowerINT_TO_FP(Op, DAG);
7673 case ISD::FP_TO_SINT:
7674 case ISD::FP_TO_UINT:
7677 return LowerFP_TO_INT(Op, DAG);
7680 return LowerFP_TO_INT_SAT(Op, DAG);
7681 case ISD::FSINCOS:
7682 return LowerFSINCOS(Op, DAG);
7683 case ISD::GET_ROUNDING:
7684 return LowerGET_ROUNDING(Op, DAG);
7685 case ISD::SET_ROUNDING:
7686 return LowerSET_ROUNDING(Op, DAG);
7687 case ISD::GET_FPMODE:
7688 return LowerGET_FPMODE(Op, DAG);
7689 case ISD::SET_FPMODE:
7690 return LowerSET_FPMODE(Op, DAG);
7691 case ISD::RESET_FPMODE:
7692 return LowerRESET_FPMODE(Op, DAG);
7693 case ISD::MUL:
7694 return LowerMUL(Op, DAG);
7695 case ISD::MULHS:
7696 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7697 case ISD::MULHU:
7698 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7700 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7702 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7704 return LowerINTRINSIC_VOID(Op, DAG);
7705 case ISD::ATOMIC_STORE:
7706 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7707 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7708 return LowerStore128(Op, DAG);
7709 }
7710 return SDValue();
7711 case ISD::STORE:
7712 return LowerSTORE(Op, DAG);
7713 case ISD::MSTORE:
7714 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7715 case ISD::MGATHER:
7716 return LowerMGATHER(Op, DAG);
7717 case ISD::MSCATTER:
7718 return LowerMSCATTER(Op, DAG);
7720 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7721 case ISD::VECREDUCE_ADD:
7722 case ISD::VECREDUCE_AND:
7723 case ISD::VECREDUCE_OR:
7724 case ISD::VECREDUCE_XOR:
7734 return LowerVECREDUCE(Op, DAG);
7736 return LowerATOMIC_LOAD_AND(Op, DAG);
7738 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7739 case ISD::VSCALE:
7740 return LowerVSCALE(Op, DAG);
7742 return LowerVECTOR_COMPRESS(Op, DAG);
7743 case ISD::ANY_EXTEND:
7744 case ISD::SIGN_EXTEND:
7745 case ISD::ZERO_EXTEND:
7746 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7747 case ISD::ADDRSPACECAST:
7748 return LowerADDRSPACECAST(Op, DAG);
7750 // Only custom lower when ExtraVT has a legal byte based element type.
7751 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7752 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7753 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7754 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7755 return SDValue();
7756
7757 return LowerToPredicatedOp(Op, DAG,
7758 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7759 }
7760 case ISD::TRUNCATE:
7761 return LowerTRUNCATE(Op, DAG);
7762 case ISD::MLOAD:
7763 return LowerMLOAD(Op, DAG);
7764 case ISD::LOAD:
7765 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7766 !Subtarget->isNeonAvailable()))
7767 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7768 return LowerLOAD(Op, DAG);
7769 case ISD::ADD:
7770 case ISD::AND:
7771 case ISD::SUB:
7772 return LowerToScalableOp(Op, DAG);
7773 case ISD::FMAXIMUM:
7774 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7775 case ISD::FMAXNUM:
7776 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7777 case ISD::FMINIMUM:
7778 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7779 case ISD::FMINNUM:
7780 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7781 case ISD::VSELECT:
7782 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7783 case ISD::ABS:
7784 return LowerABS(Op, DAG);
7785 case ISD::ABDS:
7786 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7787 case ISD::ABDU:
7788 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7789 case ISD::AVGFLOORS:
7790 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7791 case ISD::AVGFLOORU:
7792 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7793 case ISD::AVGCEILS:
7794 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7795 case ISD::AVGCEILU:
7796 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7797 case ISD::BITREVERSE:
7798 return LowerBitreverse(Op, DAG);
7799 case ISD::BSWAP:
7800 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7801 case ISD::CTLZ:
7802 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7803 case ISD::CTTZ:
7804 return LowerCTTZ(Op, DAG);
7805 case ISD::VECTOR_SPLICE:
7806 return LowerVECTOR_SPLICE(Op, DAG);
7808 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7810 return LowerVECTOR_INTERLEAVE(Op, DAG);
7812 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7813 case ISD::LRINT:
7814 case ISD::LLRINT:
7815 if (Op.getValueType().isVector())
7816 return LowerVectorXRINT(Op, DAG);
7817 [[fallthrough]];
7818 case ISD::LROUND:
7819 case ISD::LLROUND: {
7820 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7821 Op.getOperand(0).getValueType() == MVT::bf16) &&
7822 "Expected custom lowering of rounding operations only for f16");
7823 SDLoc DL(Op);
7824 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7825 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7826 }
7827 case ISD::STRICT_LROUND:
7829 case ISD::STRICT_LRINT:
7830 case ISD::STRICT_LLRINT: {
7831 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7832 Op.getOperand(1).getValueType() == MVT::bf16) &&
7833 "Expected custom lowering of rounding operations only for f16");
7834 SDLoc DL(Op);
7835 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7836 {Op.getOperand(0), Op.getOperand(1)});
7837 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7838 {Ext.getValue(1), Ext.getValue(0)});
7839 }
7840 case ISD::WRITE_REGISTER: {
7841 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7842 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7843 SDLoc DL(Op);
7844
7845 SDValue Chain = Op.getOperand(0);
7846 SDValue SysRegName = Op.getOperand(1);
7847 std::pair<SDValue, SDValue> Pair =
7848 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7849
7850 // chain = MSRR(chain, sysregname, lo, hi)
7851 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7852 SysRegName, Pair.first, Pair.second);
7853
7854 return Result;
7855 }
7856 case ISD::FSHL:
7857 case ISD::FSHR:
7858 return LowerFunnelShift(Op, DAG);
7859 case ISD::FLDEXP:
7860 return LowerFLDEXP(Op, DAG);
7862 return LowerVECTOR_HISTOGRAM(Op, DAG);
7866 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7867 }
7868}
7869
7871 return !Subtarget->useSVEForFixedLengthVectors();
7872}
7873
7874 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7875 EVT VT, bool OverrideNEON) const {
7876 if (!VT.isFixedLengthVector() || !VT.isSimple())
7877 return false;
7878
7879 // Don't use SVE for vectors we cannot scalarize if required.
7880 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7881 // Fixed length predicates should be promoted to i8.
7882 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7883 case MVT::i1:
7884 default:
7885 return false;
7886 case MVT::i8:
7887 case MVT::i16:
7888 case MVT::i32:
7889 case MVT::i64:
7890 case MVT::f16:
7891 case MVT::f32:
7892 case MVT::f64:
7893 break;
7894 }
7895
7896 // NEON-sized vectors can be emulated using SVE instructions.
7897 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7898 return Subtarget->isSVEorStreamingSVEAvailable();
7899
7900 // Ensure NEON MVTs only belong to a single register class.
7901 if (VT.getFixedSizeInBits() <= 128)
7902 return false;
7903
7904 // Ensure wider than NEON code generation is enabled.
7905 if (!Subtarget->useSVEForFixedLengthVectors())
7906 return false;
7907
7908 // Don't use SVE for types that don't fit.
7909 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7910 return false;
7911
7912 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7913 // the base fixed length SVE support in place.
7914 if (!VT.isPow2VectorType())
7915 return false;
7916
7917 return true;
7918}
7919
7920//===----------------------------------------------------------------------===//
7921// Calling Convention Implementation
7922//===----------------------------------------------------------------------===//
7923
7924static unsigned getIntrinsicID(const SDNode *N) {
7925 unsigned Opcode = N->getOpcode();
7926 switch (Opcode) {
7927 default:
7930 unsigned IID = N->getConstantOperandVal(0);
7931 if (IID < Intrinsic::num_intrinsics)
7932 return IID;
7934 }
7935 }
7936}
7937
7939 SDValue N1) const {
7940 if (!N0.hasOneUse())
7941 return false;
7942
7943 unsigned IID = getIntrinsicID(N1.getNode());
7944 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7945 if (IID == Intrinsic::aarch64_neon_umull ||
7946 N1.getOpcode() == AArch64ISD::UMULL ||
7947 IID == Intrinsic::aarch64_neon_smull ||
7948 N1.getOpcode() == AArch64ISD::SMULL)
7949 return N0.getOpcode() != ISD::ADD;
7950
7951 return true;
7952}
7953
7954/// Selects the correct CCAssignFn for a given CallingConvention value.
7955 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7956 bool IsVarArg) const {
7957 switch (CC) {
7958 default:
7959 reportFatalUsageError("unsupported calling convention");
7960 case CallingConv::GHC:
7961 return CC_AArch64_GHC;
7963 // The VarArg implementation makes assumptions about register
7964 // argument passing that do not hold for preserve_none, so we
7965 // instead fall back to C argument passing.
7966 // The non-vararg case is handled in the CC function itself.
7967 if (!IsVarArg)
7969 [[fallthrough]];
7970 case CallingConv::C:
7971 case CallingConv::Fast:
7975 case CallingConv::Swift:
7977 case CallingConv::Tail:
7978 case CallingConv::GRAAL:
7979 if (Subtarget->isTargetWindows()) {
7980 if (IsVarArg) {
7981 if (Subtarget->isWindowsArm64EC())
7984 }
7985 return CC_AArch64_Win64PCS;
7986 }
7987 if (!Subtarget->isTargetDarwin())
7988 return CC_AArch64_AAPCS;
7989 if (!IsVarArg)
7990 return CC_AArch64_DarwinPCS;
7993 case CallingConv::Win64:
7994 if (IsVarArg) {
7995 if (Subtarget->isWindowsArm64EC())
7998 }
7999 return CC_AArch64_Win64PCS;
8001 if (Subtarget->isWindowsArm64EC())
8009 return CC_AArch64_AAPCS;
8014 }
8015}
8016
8017CCAssignFn *
8018 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8019 switch (CC) {
8020 default:
8021 return RetCC_AArch64_AAPCS;
8025 if (Subtarget->isWindowsArm64EC())
8027 return RetCC_AArch64_AAPCS;
8028 }
8029}
8030
8031static bool isPassedInFPR(EVT VT) {
8032 return VT.isFixedLengthVector() ||
8033 (VT.isFloatingPoint() && !VT.isScalableVector());
8034}
8035
8036SDValue AArch64TargetLowering::LowerFormalArguments(
8037 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8038 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8039 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8040 MachineFunction &MF = DAG.getMachineFunction();
8041 const Function &F = MF.getFunction();
8042 MachineFrameInfo &MFI = MF.getFrameInfo();
8043 bool IsWin64 =
8044 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8045 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8046 (isVarArg && Subtarget->isWindowsArm64EC());
8047 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8048
8050 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8052 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8053 FuncInfo->setIsSVECC(true);
8054
8055 // Assign locations to all of the incoming arguments.
8056 SmallVector<CCValAssign, 16> ArgLocs;
8057 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8058
8059 // At this point, Ins[].VT may already be promoted to i32. To correctly
8060 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8061 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8062 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8063 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8064 // LocVT.
8065 unsigned NumArgs = Ins.size();
8066 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8067 unsigned CurArgIdx = 0;
8068 bool UseVarArgCC = false;
8069 if (IsWin64)
8070 UseVarArgCC = isVarArg;
8071
8072 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8073
8074 for (unsigned i = 0; i != NumArgs; ++i) {
8075 MVT ValVT = Ins[i].VT;
8076 if (Ins[i].isOrigArg()) {
8077 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8078 CurArgIdx = Ins[i].getOrigArgIndex();
8079
8080 // Get type of the original argument.
8081 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8082 /*AllowUnknown*/ true);
8083 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8084 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8085 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8086 ValVT = MVT::i8;
8087 else if (ActualMVT == MVT::i16)
8088 ValVT = MVT::i16;
8089 }
8090 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8091 Ins[i].OrigTy, CCInfo);
8092 assert(!Res && "Call operand has unhandled type");
8093 (void)Res;
8094 }
8095
8096 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8097 bool IsLocallyStreaming =
8098 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8099 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8100 SDValue Glue = Chain.getValue(1);
8101
8102 unsigned ExtraArgLocs = 0;
8103 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8104 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8105
8106 if (Ins[i].Flags.isByVal()) {
8107 // Byval is used for HFAs in the PCS, but the system should work in a
8108 // non-compliant manner for larger structs.
8109 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8110 int Size = Ins[i].Flags.getByValSize();
8111 unsigned NumRegs = (Size + 7) / 8;
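// The byval area is modelled as a whole number of 8-byte registers, so the
// fixed frame object below is sized as NumRegs * 8.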
8112
8113 // FIXME: This works on big-endian for composite byvals, which are the common
8114 // case. It should work for fundamental types too.
8115 unsigned FrameIdx =
8116 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8117 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8118 InVals.push_back(FrameIdxN);
8119
8120 continue;
8121 }
8122
8123 if (Ins[i].Flags.isSwiftAsync())
8125
8126 SDValue ArgValue;
8127 if (VA.isRegLoc()) {
8128 // Arguments stored in registers.
8129 EVT RegVT = VA.getLocVT();
8130 const TargetRegisterClass *RC;
8131
8132 if (RegVT == MVT::i32)
8133 RC = &AArch64::GPR32RegClass;
8134 else if (RegVT == MVT::i64)
8135 RC = &AArch64::GPR64RegClass;
8136 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8137 RC = &AArch64::FPR16RegClass;
8138 else if (RegVT == MVT::f32)
8139 RC = &AArch64::FPR32RegClass;
8140 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8141 RC = &AArch64::FPR64RegClass;
8142 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8143 RC = &AArch64::FPR128RegClass;
8144 else if (RegVT.isScalableVector() &&
8145 RegVT.getVectorElementType() == MVT::i1) {
8146 FuncInfo->setIsSVECC(true);
8147 RC = &AArch64::PPRRegClass;
8148 } else if (RegVT == MVT::aarch64svcount) {
8149 FuncInfo->setIsSVECC(true);
8150 RC = &AArch64::PPRRegClass;
8151 } else if (RegVT.isScalableVector()) {
8152 FuncInfo->setIsSVECC(true);
8153 RC = &AArch64::ZPRRegClass;
8154 } else
8155 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8156
8157 // Transform the arguments in physical registers into virtual ones.
8158 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8159
8160 if (IsLocallyStreaming) {
8161 // LocallyStreamingFunctions must insert the SMSTART in the correct
8162 // position, so we use Glue to ensure no instructions can be scheduled
8163 // between the chain of:
8164 // t0: ch,glue = EntryNode
8165 // t1: res,ch,glue = CopyFromReg
8166 // ...
8167 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8168 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8169 // ^^^^^^
8170 // This will be the new Chain/Root node.
8171 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8172 Glue = ArgValue.getValue(2);
8173 if (isPassedInFPR(ArgValue.getValueType())) {
8174 ArgValue =
8175 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8176 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8177 {ArgValue, Glue});
8178 Glue = ArgValue.getValue(1);
8179 }
8180 } else
8181 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8182
8183 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8184 // to 64 bits. Insert an assert[sz]ext to capture this, then
8185 // truncate to the right size.
8186 switch (VA.getLocInfo()) {
8187 default:
8188 llvm_unreachable("Unknown loc info!");
8189 case CCValAssign::Full:
8190 break;
8191 case CCValAssign::Indirect:
8192 assert(
8193 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8194 "Indirect arguments should be scalable on most subtargets");
8195 break;
8196 case CCValAssign::BCvt:
8197 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8198 break;
8199 case CCValAssign::AExt:
8200 case CCValAssign::SExt:
8201 case CCValAssign::ZExt:
8202 break;
8204 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8205 DAG.getConstant(32, DL, RegVT));
8206 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8207 break;
8208 }
8209 } else { // VA.isRegLoc()
8210 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8211 unsigned ArgOffset = VA.getLocMemOffset();
8212 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8213 ? VA.getLocVT().getSizeInBits()
8214 : VA.getValVT().getSizeInBits()) / 8;
8215
8216 uint32_t BEAlign = 0;
8217 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8218 !Ins[i].Flags.isInConsecutiveRegs())
8219 BEAlign = 8 - ArgSize;
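// On big-endian targets a sub-8-byte stack argument lives in the high bytes
// of its 8-byte slot, so the load below is offset by BEAlign to read the
// correct bytes.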
8220
8221 SDValue FIN;
8222 MachinePointerInfo PtrInfo;
8223 if (StackViaX4) {
8224 // In both the ARM64EC varargs convention and the thunk convention,
8225 // arguments on the stack are accessed relative to x4, not sp. In
8226 // the thunk convention, there's an additional offset of 32 bytes
8227 // to account for the shadow store.
8228 unsigned ObjOffset = ArgOffset + BEAlign;
8229 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8230 ObjOffset += 32;
8231 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8232 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8233 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8234 DAG.getConstant(ObjOffset, DL, MVT::i64));
8236 } else {
8237 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8238
8239 // Create load nodes to retrieve arguments from the stack.
8240 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8241 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8242 }
8243
8244 // For a NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
8245 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8246 MVT MemVT = VA.getValVT();
8247
8248 switch (VA.getLocInfo()) {
8249 default:
8250 break;
8251 case CCValAssign::Trunc:
8252 case CCValAssign::BCvt:
8253 MemVT = VA.getLocVT();
8254 break;
8255 case CCValAssign::Indirect:
8256 assert((VA.getValVT().isScalableVT() ||
8257 Subtarget->isWindowsArm64EC()) &&
8258 "Indirect arguments should be scalable on most subtargets");
8259 MemVT = VA.getLocVT();
8260 break;
8261 case CCValAssign::SExt:
8262 ExtType = ISD::SEXTLOAD;
8263 break;
8264 case CCValAssign::ZExt:
8265 ExtType = ISD::ZEXTLOAD;
8266 break;
8267 case CCValAssign::AExt:
8268 ExtType = ISD::EXTLOAD;
8269 break;
8270 }
8271
8272 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8273 MemVT);
8274 }
8275
8276 if (VA.getLocInfo() == CCValAssign::Indirect) {
8277 assert((VA.getValVT().isScalableVT() ||
8278 Subtarget->isWindowsArm64EC()) &&
8279 "Indirect arguments should be scalable on most subtargets");
8280
8281 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8282 unsigned NumParts = 1;
8283 if (Ins[i].Flags.isInConsecutiveRegs()) {
8284 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8285 ++NumParts;
8286 }
8287
8288 MVT PartLoad = VA.getValVT();
8289 SDValue Ptr = ArgValue;
8290
8291 // Ensure we generate all loads for each tuple part, whilst updating the
8292 // pointer after each load correctly using vscale.
8293 while (NumParts > 0) {
8294 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8295 InVals.push_back(ArgValue);
8296 NumParts--;
8297 if (NumParts > 0) {
8298 SDValue BytesIncrement;
8299 if (PartLoad.isScalableVector()) {
8300 BytesIncrement = DAG.getVScale(
8301 DL, Ptr.getValueType(),
8302 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8303 } else {
8304 BytesIncrement = DAG.getConstant(
8305 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8306 Ptr.getValueType());
8307 }
8308 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8309 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8310 ExtraArgLocs++;
8311 i++;
8312 }
8313 }
8314 } else {
8315 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8316 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8317 ArgValue, DAG.getValueType(MVT::i32));
8318
8319 // i1 arguments are zero-extended to i8 by the caller. Emit a
8320 // hint to reflect this.
8321 if (Ins[i].isOrigArg()) {
8322 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8323 if (OrigArg->getType()->isIntegerTy(1)) {
8324 if (!Ins[i].Flags.isZExt()) {
8325 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8326 ArgValue.getValueType(), ArgValue);
8327 }
8328 }
8329 }
8330
8331 InVals.push_back(ArgValue);
8332 }
8333 }
8334 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8335
8336 if (Attrs.hasStreamingCompatibleInterface()) {
8337 SDValue EntryPStateSM =
8338 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8339 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8340
8341 // Copy the value to a virtual register, and save that in FuncInfo.
8342 Register EntryPStateSMReg =
8343 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8344 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8345 EntryPStateSM);
8346 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8347 }
8348
8349 // Insert the SMSTART if this is a locally streaming function and
8350 // make sure it is Glued to the last CopyFromReg value.
8351 if (IsLocallyStreaming) {
8352 if (Attrs.hasStreamingCompatibleInterface())
8353 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8355 else
8356 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8358
8359 // Ensure that the SMSTART happens after the CopyWithChain such that its
8360 // chain result is used.
8361 for (unsigned I=0; I<InVals.size(); ++I) {
8364 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8365 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8366 InVals[I].getValueType());
8367 }
8368 }
8369
8370 // varargs
8371 if (isVarArg) {
8373 if (!Subtarget->isTargetDarwin() || IsWin64) {
8374 // The AAPCS variadic function ABI is identical to the non-variadic
8375 // one. As a result there may be more arguments in registers and we
8376 // should save them for future reference.
8377 // Win64 variadic functions also pass arguments in registers, but all
8378 // float arguments are passed in integer registers.
8379 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8380 }
8381
8382 // This will point to the next argument passed via stack.
8383 unsigned VarArgsOffset = CCInfo.getStackSize();
8384 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8385 VarArgsOffset =
8386 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8387 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8388 FuncInfo->setVarArgsStackIndex(
8389 MFI.CreateFixedObject(4, VarArgsOffset, true));
8390 }
8391
8392 if (MFI.hasMustTailInVarArgFunc()) {
8393 SmallVector<MVT, 2> RegParmTypes;
8394 RegParmTypes.push_back(MVT::i64);
8395 RegParmTypes.push_back(MVT::f128);
8396 // Compute the set of forwarded registers. The rest are scratch.
8398 FuncInfo->getForwardedMustTailRegParms();
8399 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8401
8402 // Conservatively forward X8, since it might be used for aggregate return.
8403 if (!CCInfo.isAllocated(AArch64::X8)) {
8404 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8405 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8406 }
8407 }
8408 }
8409
8410 // On Windows, InReg pointers must be returned, so record the pointer in a
8411 // virtual register at the start of the function so it can be returned in the
8412 // epilogue.
8413 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8414 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8415 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8416 Ins[I].Flags.isInReg()) &&
8417 Ins[I].Flags.isSRet()) {
8418 assert(!FuncInfo->getSRetReturnReg());
8419
8420 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8421 Register Reg =
8423 FuncInfo->setSRetReturnReg(Reg);
8424
8425 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8426 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8427 break;
8428 }
8429 }
8430 }
8431
8432 unsigned StackArgSize = CCInfo.getStackSize();
8433 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8434 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8435 // This is a non-standard ABI so by fiat I say we're allowed to make full
8436 // use of the stack area to be popped, which must be aligned to 16 bytes in
8437 // any case:
8438 StackArgSize = alignTo(StackArgSize, 16);
8439
8440 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8441 // a multiple of 16.
8442 FuncInfo->setArgumentStackToRestore(StackArgSize);
8443
8444 // This realignment carries over to the available bytes below. Our own
8445 // callers will guarantee the space is free by giving an aligned value to
8446 // CALLSEQ_START.
8447 }
8448 // Even if we're not expected to free up the space, it's useful to know how
8449 // much is there while considering tail calls (because we can reuse it).
8450 FuncInfo->setBytesInStackArgArea(StackArgSize);
8451
8452 if (Subtarget->hasCustomCallingConv())
8454
8455 if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
8456 // Old SME ABI lowering (deprecated):
8457 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8458 // will be expanded and stored in the static object later using a
8459 // pseudonode.
8460 if (Attrs.hasZAState()) {
8461 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8462 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8463 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8464 DAG.getConstant(1, DL, MVT::i32));
8465 SDValue Buffer;
8466 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8467 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8468 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8469 } else {
8470 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8471 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8472 DAG.getVTList(MVT::i64, MVT::Other),
8473 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8474 MFI.CreateVariableSizedObject(Align(16), nullptr);
8475 }
8476 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8477 DAG.getConstant(1, DL, MVT::i32));
8478 Chain = DAG.getNode(
8479 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8480 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8481 /*Num save slices*/ NumZaSaveSlices});
8482 } else if (Attrs.hasAgnosticZAInterface()) {
8483 // Call __arm_sme_state_size().
8484 SDValue BufferSize =
8485 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8486 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8487 Chain = BufferSize.getValue(1);
8488 SDValue Buffer;
8489 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8490 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8491 DAG.getVTList(MVT::i64, MVT::Other),
8492 {Chain, BufferSize});
8493 } else {
8494 // Allocate space dynamically.
8495 Buffer = DAG.getNode(
8496 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8497 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8498 MFI.CreateVariableSizedObject(Align(16), nullptr);
8499 }
8500 // Copy the value to a virtual register, and save that in FuncInfo.
8501 Register BufferPtr =
8502 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8503 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8504 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8505 }
8506 }
8507
8508 if (CallConv == CallingConv::PreserveNone) {
8509 for (const ISD::InputArg &I : Ins) {
8510 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8511 I.Flags.isSwiftAsync()) {
8514 MF.getFunction(),
8515 "Swift attributes can't be used with preserve_none",
8516 DL.getDebugLoc()));
8517 break;
8518 }
8519 }
8520 }
8521
8522 if (getTM().useNewSMEABILowering()) {
8523 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8524 if (Attrs.isNewZT0())
8525 Chain = DAG.getNode(
8526 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8527 DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8528 DAG.getTargetConstant(0, DL, MVT::i32));
8529 }
8530
8531 return Chain;
8532}
8533
8534void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8535 SelectionDAG &DAG,
8536 const SDLoc &DL,
8537 SDValue &Chain) const {
8539 MachineFrameInfo &MFI = MF.getFrameInfo();
8541 auto PtrVT = getPointerTy(DAG.getDataLayout());
8542 Function &F = MF.getFunction();
8543 bool IsWin64 =
8544 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8545
8547
8549 unsigned NumGPRArgRegs = GPRArgRegs.size();
8550 if (Subtarget->isWindowsArm64EC()) {
8551 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8552 // functions.
8553 NumGPRArgRegs = 4;
8554 }
8555 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8556
8557 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
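// One 8-byte slot is reserved for every argument register that was not used
// for a named argument, so va_start can find the remaining variadic GPRs
// spilled contiguously in this save area.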
8558 int GPRIdx = 0;
8559 if (GPRSaveSize != 0) {
8560 if (IsWin64) {
8561 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8562 if (GPRSaveSize & 15)
8563 // The extra size here, if triggered, will always be 8.
8564 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8565 } else
8566 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8567
8568 SDValue FIN;
8569 if (Subtarget->isWindowsArm64EC()) {
8570 // With the Arm64EC ABI, we reserve the save area as usual, but we
8571 // compute its address relative to x4. For a normal AArch64->AArch64
8572 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8573 // different address.
8574 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8575 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8576 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8577 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8578 } else {
8579 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8580 }
8581
8582 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8583 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8584 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8585 SDValue Store =
8586 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8588 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8589 : MachinePointerInfo::getStack(MF, i * 8));
8590 MemOps.push_back(Store);
8591 FIN =
8592 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8593 }
8594 }
8595 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8596 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8597
8598 if (Subtarget->hasFPARMv8() && !IsWin64) {
8600 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8601 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8602
8603 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
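// Each unallocated FPR argument register is saved as a full 16-byte
// q-register, so va_arg can later reload any floating-point or vector type
// from its slot.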
8604 int FPRIdx = 0;
8605 if (FPRSaveSize != 0) {
8606 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8607
8608 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8609
8610 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8611 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8612 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8613
8614 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8615 MachinePointerInfo::getStack(MF, i * 16));
8616 MemOps.push_back(Store);
8617 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8618 DAG.getConstant(16, DL, PtrVT));
8619 }
8620 }
8621 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8622 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8623 }
8624
8625 if (!MemOps.empty()) {
8626 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8627 }
8628}
8629
8630/// LowerCallResult - Lower the result values of a call into the
8631/// appropriate copies out of appropriate physical registers.
8632SDValue AArch64TargetLowering::LowerCallResult(
8633 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8634 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8635 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8636 SDValue ThisVal, bool RequiresSMChange) const {
8637 DenseMap<unsigned, SDValue> CopiedRegs;
8638 // Copy all of the result registers out of their specified physreg.
8639 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8640 CCValAssign VA = RVLocs[i];
8641
8642 // Pass 'this' value directly from the argument to return value, to avoid
8643 // reg unit interference
8644 if (i == 0 && isThisReturn) {
8645 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8646 "unexpected return calling convention register assignment");
8647 InVals.push_back(ThisVal);
8648 continue;
8649 }
8650
8651 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8652 // allows one use of a physreg per block.
8653 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8654 if (!Val) {
8655 Val =
8656 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8657 Chain = Val.getValue(1);
8658 InGlue = Val.getValue(2);
8659 CopiedRegs[VA.getLocReg()] = Val;
8660 }
8661
8662 switch (VA.getLocInfo()) {
8663 default:
8664 llvm_unreachable("Unknown loc info!");
8665 case CCValAssign::Full:
8666 break;
8667 case CCValAssign::BCvt:
8668 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8669 break;
8671 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8672 DAG.getConstant(32, DL, VA.getLocVT()));
8673 [[fallthrough]];
8674 case CCValAssign::AExt:
8675 [[fallthrough]];
8676 case CCValAssign::ZExt:
8677 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8678 break;
8679 }
8680
8681 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8682 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8683 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8684
8685 InVals.push_back(Val);
8686 }
8687
8688 return Chain;
8689}
8690
8691/// Return true if the calling convention is one that we can guarantee TCO for.
8692static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8693 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8695}
8696
8697/// Return true if we might ever do TCO for calls with this calling convention.
8699 switch (CC) {
8700 case CallingConv::C:
8705 case CallingConv::Swift:
8707 case CallingConv::Tail:
8708 case CallingConv::Fast:
8709 return true;
8710 default:
8711 return false;
8712 }
8713}
8714
8715 /// Return true if the calling convention supports varargs.
8716 /// Currently only conventions that pass varargs the way the C calling
8717 /// convention does are eligible.
8718 /// Calling conventions listed in this function must also be properly
8719 /// handled in AArch64Subtarget::isCallingConvWin64.
8721 switch (CC) {
8722 case CallingConv::C:
8724 // SVE vector call is only partially supported, but it should
8725 // support named arguments being passed. Any arguments being passed
8726 // as varargs are still unsupported.
8728 return true;
8729 default:
8730 return false;
8731 }
8732}
8733
8735 const AArch64Subtarget *Subtarget,
8737 CCState &CCInfo) {
8738 const SelectionDAG &DAG = CLI.DAG;
8739 CallingConv::ID CalleeCC = CLI.CallConv;
8740 bool IsVarArg = CLI.IsVarArg;
8741 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8742 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8743
8744 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8745 // for the shadow store.
8746 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8747 CCInfo.AllocateStack(32, Align(16));
8748
8749 unsigned NumArgs = Outs.size();
8750 for (unsigned i = 0; i != NumArgs; ++i) {
8751 MVT ArgVT = Outs[i].VT;
8752 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8753
8754 bool UseVarArgCC = false;
8755 if (IsVarArg) {
8756 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8757 // too, so use the vararg CC to force them to integer registers.
8758 if (IsCalleeWin64) {
8759 UseVarArgCC = true;
8760 } else {
8761 UseVarArgCC = ArgFlags.isVarArg();
8762 }
8763 }
8764
8765 if (!UseVarArgCC) {
8766 // Get type of the original argument.
8767 EVT ActualVT =
8768 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8769 /*AllowUnknown*/ true);
8770 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8771 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8772 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8773 ArgVT = MVT::i8;
8774 else if (ActualMVT == MVT::i16)
8775 ArgVT = MVT::i16;
8776 }
8777
8778 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8779 // argument. This logic should exactly mirror LowerFormalArguments.
8780 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8781 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8782 Outs[i].OrigTy, CCInfo);
8783 assert(!Res && "Call operand has unhandled type");
8784 (void)Res;
8785 }
8786}
8787
8788static SMECallAttrs
8791 if (CLI.CB)
8792 return SMECallAttrs(*CLI.CB, &TLI);
8793 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8794 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
8796}
8797
8798bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8799 const CallLoweringInfo &CLI) const {
8800 CallingConv::ID CalleeCC = CLI.CallConv;
8801 if (!mayTailCallThisCC(CalleeCC))
8802 return false;
8803
8804 SDValue Callee = CLI.Callee;
8805 bool IsVarArg = CLI.IsVarArg;
8806 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8807 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8808 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8809 const SelectionDAG &DAG = CLI.DAG;
8811 const Function &CallerF = MF.getFunction();
8812 CallingConv::ID CallerCC = CallerF.getCallingConv();
8813
8814 // SME Streaming functions are not eligible for TCO as they may require
8815 // the streaming mode or ZA to be restored after returning from the call.
8816 SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
8817 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
8818 CallAttrs.requiresPreservingAllZAState() ||
8819 CallAttrs.caller().hasStreamingBody())
8820 return false;
8821
8822 // Functions using the C or Fast calling convention that have an SVE signature
8823 // preserve more registers and should assume the SVE_VectorCall CC.
8824 // The check for matching callee-saved regs will determine whether it is
8825 // eligible for TCO.
8826 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8827 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8828 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8829
8830 bool CCMatch = CallerCC == CalleeCC;
8831
8832 // When using the Windows calling convention on a non-windows OS, we want
8833 // to back up and restore X18 in such functions; we can't do a tail call
8834 // from those functions.
8835 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8836 CalleeCC != CallingConv::Win64)
8837 return false;
8838
8839 // Byval parameters hand the function a pointer directly into the stack area
8840 // we want to reuse during a tail call. Working around this *is* possible (see
8841 // X86) but less efficient and uglier in LowerCall.
8842 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8843 e = CallerF.arg_end();
8844 i != e; ++i) {
8845 if (i->hasByValAttr())
8846 return false;
8847
8848 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8849 // In this case, it is necessary to save X0/X1 in the callee and return it
8850 // in X0. Tail call opt may interfere with this, so we disable tail call
8851 // opt when the caller has an "inreg" attribute -- except if the callee
8852 // also has that attribute on the same argument, and the same value is
8853 // passed.
8854 if (i->hasInRegAttr()) {
8855 unsigned ArgIdx = i - CallerF.arg_begin();
8856 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
8857 return false;
8858 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
8859 if (!Attrs.hasAttribute(Attribute::InReg) ||
8860 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
8861 CLI.CB->getArgOperand(ArgIdx) != i) {
8862 return false;
8863 }
8864 }
8865 }
8866
8867 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8868 return CCMatch;
8869
8870 // Externally-defined functions with weak linkage should not be
8871 // tail-called on AArch64 when the OS does not support dynamic
8872 // pre-emption of symbols, as the AAELF spec requires normal calls
8873 // to undefined weak functions to be replaced with a NOP or jump to the
8874 // next instruction. The behaviour of branch instructions in this
8875 // situation (as used for tail calls) is implementation-defined, so we
8876 // cannot rely on the linker replacing the tail call with a return.
8877 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8878 const GlobalValue *GV = G->getGlobal();
8879 const Triple &TT = getTargetMachine().getTargetTriple();
8880 if (GV->hasExternalWeakLinkage() &&
8881 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8882 return false;
8883 }
8884
8885 // Now we search for cases where we can use a tail call without changing the
8886 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8887 // concept.
8888
8889 // I want anyone implementing a new calling convention to think long and hard
8890 // about this check.
8891 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8892 report_fatal_error("Unsupported variadic calling convention");
8893
8894 LLVMContext &C = *DAG.getContext();
8895 // Check that the call results are passed in the same way.
8896 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8897 CCAssignFnForCall(CalleeCC, IsVarArg),
8898 CCAssignFnForCall(CallerCC, IsVarArg)))
8899 return false;
8900 // The callee has to preserve all registers the caller needs to preserve.
8901 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8902 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8903 if (!CCMatch) {
8904 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8905 if (Subtarget->hasCustomCallingConv()) {
8906 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8907 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8908 }
8909 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8910 return false;
8911 }
8912
8913 // Nothing more to check if the callee is taking no arguments
8914 if (Outs.empty())
8915 return true;
8916
8917 SmallVector<CCValAssign, 16> ArgLocs;
8918 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8919
8920 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8921
8922 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8923 // When the call is musttail, additional checks have already been done, so we can safely skip this one.
8924 // At least two cases here: if caller is fastcc then we can't have any
8925 // memory arguments (we'd be expected to clean up the stack afterwards). If
8926 // caller is C then we could potentially use its argument area.
8927
8928 // FIXME: for now we take the most conservative of these in both cases:
8929 // disallow all variadic memory operands.
8930 for (const CCValAssign &ArgLoc : ArgLocs)
8931 if (!ArgLoc.isRegLoc())
8932 return false;
8933 }
8934
8935 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8936
8937 // If any of the arguments is passed indirectly, it must be SVE, so the
8938 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8939 // allocate space on the stack. That is why we explicitly decide here that
8940 // such a call cannot be a tailcall.
8941 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8942 assert((A.getLocInfo() != CCValAssign::Indirect ||
8943 A.getValVT().isScalableVector() ||
8944 Subtarget->isWindowsArm64EC()) &&
8945 "Expected value to be scalable");
8946 return A.getLocInfo() == CCValAssign::Indirect;
8947 }))
8948 return false;
8949
8950 // If the stack arguments for this call do not fit into our own save area then
8951 // the call cannot be made tail.
8952 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8953 return false;
8954
8955 const MachineRegisterInfo &MRI = MF.getRegInfo();
8956 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8957 return false;
8958
8959 return true;
8960}
8961
8962SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8963 SelectionDAG &DAG,
8964 MachineFrameInfo &MFI,
8965 int ClobberedFI) const {
8966 SmallVector<SDValue, 8> ArgChains;
8967 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8968 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8969
8970 // Include the original chain at the beginning of the list. When this is
8971 // used by target LowerCall hooks, this helps legalize find the
8972 // CALLSEQ_BEGIN node.
8973 ArgChains.push_back(Chain);
8974
8975 // Add a chain value for each incoming stack-argument load that overlaps the clobbered slot.
8976 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8977 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8978 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8979 if (FI->getIndex() < 0) {
8980 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8981 int64_t InLastByte = InFirstByte;
8982 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8983
8984 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8985 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8986 ArgChains.push_back(SDValue(L, 1));
8987 }
8988
8989 // Build a tokenfactor for all the chains.
8990 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8991}
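// Editor's note (illustrative, not part of the LLVM source): the interval
// test above treats the clobbered slot as the byte range [FirstByte,
// LastByte] and each incoming argument slot as [InFirstByte, InLastByte],
// and records a chain only when the two ranges intersect. For example, when
// clobbering bytes [16, 23], a load of a caller argument occupying bytes
// [20, 27] is chained in, while one occupying [24, 31] is not, so only
// genuinely overlapping loads are forced to happen before the store.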
8992
8993bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8994 bool TailCallOpt) const {
8995 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8996 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8997}
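// Editor's note (illustrative, not part of the LLVM source): this predicate
// is what later yields a non-zero CalleePopBytes in LowerCall. For example,
// with -tailcallopt a fastcc callee pops its own 16-byte-aligned argument
// area, as do tailcc and swifttailcc callees, while a plain C callee leaves
// the cleanup to the caller and CalleePopBytes stays 0.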
8998
8999// Check if the value is zero-extended from i1 to i8
9000static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9001 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9002 if (SizeInBits < 8)
9003 return false;
9004
9005 APInt RequiredZero(SizeInBits, 0xFE);
9006 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9007 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9008 return ZExtBool;
9009}
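// Editor's note (illustrative, not part of the LLVM source): RequiredZero is
// the mask 0xFE = 0b1111'1110, so the test above asks whether bits [7:1] of
// the argument are known to be zero (bit 0 may hold the i1 value). A value
// produced by e.g. (zext i1 %b to i32) already satisfies the AAPCS rule that
// an i1 argument be zero-extended to 8 bits, so LowerCall can skip the extra
// TRUNCATE/ZERO_EXTEND pair; the constant 4 merely bounds the recursion
// depth of computeKnownBits.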
9010
9011void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9012 SDNode *Node) const {
9013 // Live-in physreg copies that are glued to SMSTART are applied as
9014 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9015 // register allocator to pass call args in callee saved regs, without extra
9016 // copies to avoid these fake clobbers of actually-preserved GPRs.
9017 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9018 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9019 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9020 if (MachineOperand &MO = MI.getOperand(I);
9021 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9022 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9023 AArch64::GPR64RegClass.contains(MO.getReg())))
9024 MI.removeOperand(I);
9025
9026 // The SVE vector length can change when entering/leaving streaming mode.
9027 // FPMR is set to 0 when entering/leaving streaming mode.
9028 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9029 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9030 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9031 /*IsImplicit=*/true));
9032 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9033 /*IsImplicit=*/true));
9034 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9035 /*IsImplicit=*/true));
9036 }
9037 }
9038
9039 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9040 // have nothing to do with VG, were it not that they are used to materialise a
9041 // frame-address. If they contain a frame-index to a scalable vector, this
9042 // will likely require an ADDVL instruction to materialise the address, thus
9043 // reading VG.
9044 const MachineFunction &MF = *MI.getMF();
9045 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9046 (MI.getOpcode() == AArch64::ADDXri ||
9047 MI.getOpcode() == AArch64::SUBXri)) {
9048 const MachineOperand &MO = MI.getOperand(1);
9049 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
9050 TargetStackID::ScalableVector)
9051 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9052 /*IsImplicit=*/true));
9053 }
9054}
9055
9056SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
9057 bool Enable, SDValue Chain,
9058 SDValue InGlue,
9059 unsigned Condition) const {
9060 MachineFunction &MF = DAG.getMachineFunction();
9061 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9062 FuncInfo->setHasStreamingModeChanges(true);
9063
9064 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9065 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9066 SDValue MSROp =
9067 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9068 SmallVector<SDValue> Ops = {Chain, MSROp};
9069 unsigned Opcode;
9070 if (Condition != AArch64SME::Always) {
9071 FuncInfo->setPStateSMRegUsed(true);
9072 Register PStateReg = FuncInfo->getPStateSMReg();
9073 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9074 SDValue PStateSM =
9075 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9076 // Use chain and glue from the CopyFromReg.
9077 Ops[0] = PStateSM.getValue(1);
9078 InGlue = PStateSM.getValue(2);
9079 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9080 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9081 Ops.push_back(ConditionOp);
9082 Ops.push_back(PStateSM);
9083 } else {
9084 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9085 }
9086 Ops.push_back(RegMask);
9087
9088 if (InGlue)
9089 Ops.push_back(InGlue);
9090
9091 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9092}
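// Editor's note (illustrative, not part of the LLVM source): when the
// caller's streaming mode is only known at run time (Condition !=
// AArch64SME::Always), the node built above is a COND_SMSTART/COND_SMSTOP
// that carries the entry PStateSM value and the condition, so the mode is
// only toggled when it actually differs; otherwise a plain SMSTART/SMSTOP is
// emitted. In both forms the SMStartStop call-preserved register mask makes
// the register allocator treat the transition like a call.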
9093
9094// Emit a call to __arm_sme_save or __arm_sme_restore.
9095static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
9096 SelectionDAG &DAG,
9097 AArch64FunctionInfo *Info, SDLoc DL,
9098 SDValue Chain, bool IsSave) {
9099 MachineFunction &MF = DAG.getMachineFunction();
9100 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9101 FuncInfo->setSMESaveBufferUsed();
9102 TargetLowering::ArgListTy Args;
9103 Args.emplace_back(
9104 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
9106
9107 RTLIB::Libcall LC =
9108 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
9109 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
9110 TLI.getPointerTy(DAG.getDataLayout()));
9111 auto *RetTy = Type::getVoidTy(*DAG.getContext());
9112 TargetLowering::CallLoweringInfo CLI(DAG);
9113 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9114 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
9115 return TLI.LowerCallTo(CLI).second;
9116}
9117
9118static AArch64SME::ToggleCondition
9119getSMToggleCondition(const SMECallAttrs &CallAttrs) {
9120 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9121 CallAttrs.caller().hasStreamingBody())
9122 return AArch64SME::Always;
9123 if (CallAttrs.callee().hasNonStreamingInterface())
9124 return AArch64SME::IfCallerIsStreaming;
9125 if (CallAttrs.callee().hasStreamingInterface())
9126 return AArch64SME::IfCallerIsNonStreaming;
9127
9128 llvm_unreachable("Unsupported attributes");
9129}
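// Editor's note (illustrative, not part of the LLVM source): for example, a
// streaming-compatible caller invoking a non-streaming callee only needs an
// SMSTOP when it happens to be running in streaming mode, so the toggle is
// predicated on the PStateSM value captured at entry; the same caller
// invoking an __arm_streaming callee gets the inverse condition. Callers
// whose streaming state is known at compile time (including locally
// streaming bodies) always toggle unconditionally.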
9130
9131/// Check whether a stack argument requires lowering in a tail call.
9132static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9133 const CCValAssign &VA, SDValue Arg,
9134 ISD::ArgFlagsTy Flags, int CallOffset) {
9135 // FIXME: We should be able to handle this case, but it's not clear how to.
9136 if (Flags.isZExt() || Flags.isSExt())
9137 return true;
9138
9139 for (;;) {
9140 // Look through nodes that don't alter the bits of the incoming value.
9141 unsigned Op = Arg.getOpcode();
9142 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9143 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9144 Arg = Arg.getOperand(0);
9145 continue;
9146 }
9147 break;
9148 }
9149
9150 // If the argument is a load from the same immutable stack slot, we can reuse
9151 // it.
9152 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9153 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9154 const MachineFrameInfo &MFI = MF.getFrameInfo();
9155 int FI = FINode->getIndex();
9156 if (!MFI.isImmutableObjectIndex(FI))
9157 return true;
9158 if (CallOffset != MFI.getObjectOffset(FI))
9159 return true;
9160 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9161 if (SizeInBits / 8 != MFI.getObjectSize(FI))
9162 return true;
9163 return false;
9164 }
9165 }
9166
9167 return true;
9168}
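// Editor's note (illustrative example with hypothetical functions, not part
// of the LLVM source): the case this enables is a caller that forwards its
// own stack-passed argument unchanged to a tail callee, e.g.
//   long callee(long, long, long, long, long, long, long, long, long x);
//   long caller(long a, long b, long c, long d, long e, long f, long g,
//               long h, long x) {
//     return callee(a, b, c, d, e, f, g, h, x);
//   }
// Here 'x' is reloaded from an immutable fixed stack slot whose offset and
// size match the outgoing slot, so LowerCall can simply leave it in place.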
9169
9170/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9171/// and add input and output parameter nodes.
9172SDValue
9173AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9174 SmallVectorImpl<SDValue> &InVals) const {
9175 SelectionDAG &DAG = CLI.DAG;
9176 SDLoc &DL = CLI.DL;
9177 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9178 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9179 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9180 SDValue Chain = CLI.Chain;
9181 SDValue Callee = CLI.Callee;
9182 bool &IsTailCall = CLI.IsTailCall;
9183 CallingConv::ID &CallConv = CLI.CallConv;
9184 bool IsVarArg = CLI.IsVarArg;
9185 const CallBase *CB = CLI.CB;
9186
9187 MachineFunction &MF = DAG.getMachineFunction();
9188 MachineFunction::CallSiteInfo CSInfo;
9189 bool IsThisReturn = false;
9190
9191 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9192 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9193 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9194 bool IsSibCall = false;
9195 bool GuardWithBTI = false;
9196
9197 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9198 !Subtarget->noBTIAtReturnTwice()) {
9199 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9200 }
9201
9202 // Analyze operands of the call, assigning locations to each operand.
9203 SmallVector<CCValAssign, 16> ArgLocs;
9204 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9205
9206 if (IsVarArg) {
9207 unsigned NumArgs = Outs.size();
9208
9209 for (unsigned i = 0; i != NumArgs; ++i) {
9210 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9211 report_fatal_error("Passing SVE types to variadic functions is "
9212 "currently not supported");
9213 }
9214 }
9215
9216 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9217
9218 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9219 // Assign locations to each value returned by this call.
9220 SmallVector<CCValAssign, 16> RVLocs;
9221 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9222 *DAG.getContext());
9223 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9224
9225 // Set type id for call site info.
9226 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9227 CSInfo = MachineFunction::CallSiteInfo(*CB);
9228
9229 // Check callee args/returns for SVE registers and set calling convention
9230 // accordingly.
9231 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9232 auto HasSVERegLoc = [](CCValAssign &Loc) {
9233 if (!Loc.isRegLoc())
9234 return false;
9235 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9236 AArch64::PPRRegClass.contains(Loc.getLocReg());
9237 };
9238 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9239 CallConv = CallingConv::AArch64_SVE_VectorCall;
9240 }
9241
9242 // Determine whether we need any streaming mode changes.
9243 SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
9244 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9245 bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
9246 auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
9247 // TODO: Handle agnostic ZA functions.
9248 if (!UseNewSMEABILowering || IsAgnosticZAFunction)
9249 return std::nullopt;
9250 if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
9251 return std::nullopt;
9252 return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
9253 : AArch64ISD::INOUT_ZA_USE;
9254 }();
9255
9256 if (IsTailCall) {
9257 // Check if it's really possible to do a tail call.
9258 IsTailCall = isEligibleForTailCallOptimization(CLI);
9259
9260 // A sibling call is one where we're under the usual C ABI and not planning
9261 // to change that but can still do a tail call:
9262 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9263 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9264 IsSibCall = true;
9265
9266 if (IsTailCall)
9267 ++NumTailCalls;
9268 }
9269
9270 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9271 report_fatal_error("failed to perform tail call elimination on a call "
9272 "site marked musttail");
9273
9274 // Get a count of how many bytes are to be pushed on the stack.
9275 unsigned NumBytes = CCInfo.getStackSize();
9276
9277 if (IsSibCall) {
9278 // Since we're not changing the ABI to make this a tail call, the memory
9279 // operands are already available in the caller's incoming argument space.
9280 NumBytes = 0;
9281 }
9282
9283 // FPDiff is the byte offset of the call's argument area from the callee's.
9284 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9285 // by this amount for a tail call. In a sibling call it must be 0 because the
9286 // caller will deallocate the entire stack and the callee still expects its
9287 // arguments to begin at SP+0. Completely unused for non-tail calls.
9288 int FPDiff = 0;
9289
9290 if (IsTailCall && !IsSibCall) {
9291 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9292
9293 // Since callee will pop argument stack as a tail call, we must keep the
9294 // popped size 16-byte aligned.
9295 NumBytes = alignTo(NumBytes, 16);
9296
9297 // FPDiff will be negative if this tail call requires more space than we
9298 // would automatically have in our incoming argument space. Positive if we
9299 // can actually shrink the stack.
9300 FPDiff = NumReusableBytes - NumBytes;
9301
9302 // Update the required reserved area if this is the tail call requiring the
9303 // most argument stack space.
9304 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9305 FuncInfo->setTailCallReservedStack(-FPDiff);
9306
9307 // The stack pointer must be 16-byte aligned at all times it's used for a
9308 // memory operation, which in practice means at *all* times and in
9309 // particular across call boundaries. Therefore our own arguments started at
9310 // a 16-byte aligned SP and the delta applied for the tail call should
9311 // satisfy the same constraint.
9312 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9313 }
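// Editor's note (worked example, not part of the LLVM source): if the caller
// itself received 16 bytes of stack arguments (NumReusableBytes = 16) and
// the tail callee needs 48 bytes (NumBytes = 48 after 16-byte alignment),
// then FPDiff = 16 - 48 = -32, so 32 bytes are recorded via
// setTailCallReservedStack and each outgoing stack argument is later placed
// at a fixed object at LocMemOffset + FPDiff (plus any big-endian
// adjustment).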
9314
9315 auto DescribeCallsite =
9316 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9317 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9318 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9319 R << ore::NV("Callee", ES->getSymbol());
9320 else if (CLI.CB && CLI.CB->getCalledFunction())
9321 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9322 else
9323 R << "unknown callee";
9324 R << "'";
9325 return R;
9326 };
9327
9328 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9329 bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
9330 if (RequiresLazySave) {
9331 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9332 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9333 TPIDR2.FrameIndex,
9334 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9335 Chain = DAG.getNode(
9336 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9337 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9338 TPIDR2ObjAddr);
9339 OptimizationRemarkEmitter ORE(&MF.getFunction());
9340 ORE.emit([&]() {
9341 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9342 CLI.CB)
9343 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9344 &MF.getFunction());
9345 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9346 });
9347 } else if (RequiresSaveAllZA) {
9348 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9349 "Cannot share state that may not exist");
9350 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9351 /*IsSave=*/true);
9352 }
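// Editor's note (illustrative, not part of the LLVM source): the lazy-save
// setup above only publishes the address of the caller's TPIDR2 block in
// TPIDR2_EL0; no ZA data is copied at this point. Under the SME lazy-save
// ABI, ZA is spilled into that block (by the callee calling
// __arm_tpidr2_save) only if something in the callee actually needs ZA, and
// the conditional RESTORE_ZA sequence emitted after the call below reloads
// the state if that happened and then clears TPIDR2_EL0 again.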
9353
9354 bool RequiresSMChange = CallAttrs.requiresSMChange();
9355 if (RequiresSMChange) {
9356 OptimizationRemarkEmitter ORE(&MF.getFunction());
9357 ORE.emit([&]() {
9358 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9359 CLI.CB)
9360 : OptimizationRemarkAnalysis("sme", "SMETransition",
9361 &MF.getFunction());
9362 DescribeCallsite(R) << " requires a streaming mode transition";
9363 return R;
9364 });
9365 }
9366
9367 SDValue ZTFrameIdx;
9368 MachineFrameInfo &MFI = MF.getFrameInfo();
9369 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9370
9371 // If the caller has ZT0 state which will not be preserved by the callee,
9372 // spill ZT0 before the call.
9373 if (ShouldPreserveZT0) {
9374 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
9375 ZTFrameIdx = DAG.getFrameIndex(
9376 ZTObj,
9377 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9378
9379 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9380 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9381 }
9382
9383 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9384 // PSTATE.ZA before the call if there is no lazy-save active.
9385 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9386 assert((!DisableZA || !RequiresLazySave) &&
9387 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9388
9389 if (DisableZA)
9390 Chain = DAG.getNode(
9391 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9392 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9393
9394 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9395 // These operations are automatically eliminated by the prolog/epilog pass
9396 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9397 if (!IsSibCall) {
9398 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9399 if (ZAMarkerNode) {
9400 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to, simply
9401 // using a chain can result in incorrect scheduling. The markers refer to
9402 // the position just before the CALLSEQ_START (though they occur after it,
9403 // as CALLSEQ_START has no in-glue operand).
9404 Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other),
9405 {Chain, Chain.getValue(1)});
9406 }
9407 }
9408
9409 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9410 getPointerTy(DAG.getDataLayout()));
9411
9412 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9413 SmallSet<unsigned, 8> RegsUsed;
9414 SmallVector<SDValue, 8> MemOpChains;
9415 auto PtrVT = getPointerTy(DAG.getDataLayout());
9416
9417 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9418 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9419 for (const auto &F : Forwards) {
9420 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9421 RegsToPass.emplace_back(F.PReg, Val);
9422 }
9423 }
9424
9425 // Walk the register/memloc assignments, inserting copies/loads.
9426 unsigned ExtraArgLocs = 0;
9427 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9428 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9429 SDValue Arg = OutVals[i];
9430 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9431
9432 // Promote the value if needed.
9433 switch (VA.getLocInfo()) {
9434 default:
9435 llvm_unreachable("Unknown loc info!");
9436 case CCValAssign::Full:
9437 break;
9438 case CCValAssign::SExt:
9439 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9440 break;
9441 case CCValAssign::ZExt:
9442 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9443 break;
9444 case CCValAssign::AExt:
9445 if (Outs[i].ArgVT == MVT::i1) {
9446 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9447 //
9448 // Check if we actually have to do this, because the value may
9449 // already be zero-extended.
9450 //
9451 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9452 // and rely on DAGCombiner to fold this, because the following
9453 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9454 //
9455 // (ext (zext x)) -> (zext x)
9456 //
9457 // This will give us (zext i32), which we cannot remove, so
9458 // try to check this beforehand.
9459 if (!checkZExtBool(Arg, DAG)) {
9460 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9461 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9462 }
9463 }
9464 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9465 break;
9466 case CCValAssign::AExtUpper:
9467 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9468 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9469 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9470 DAG.getConstant(32, DL, VA.getLocVT()));
9471 break;
9472 case CCValAssign::BCvt:
9473 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9474 break;
9475 case CCValAssign::Trunc:
9476 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9477 break;
9478 case CCValAssign::FPExt:
9479 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9480 break;
9481 case CCValAssign::Indirect: {
9482 bool isScalable = VA.getValVT().isScalableVT();
9483 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9484 "Indirect arguments should be scalable on most subtargets");
9485
9486 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9487 uint64_t PartSize = StoreSize;
9488 unsigned NumParts = 1;
9489 if (Outs[i].Flags.isInConsecutiveRegs()) {
9490 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9491 ++NumParts;
9492 StoreSize *= NumParts;
9493 }
9494
9495 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9496 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9497 MachineFrameInfo &MFI = MF.getFrameInfo();
9498 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9499 if (isScalable)
9500 MFI.setStackID(FI, TargetStackID::ScalableVector);
9501
9502 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9503 SDValue Ptr = DAG.getFrameIndex(
9504 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9505 SDValue SpillSlot = Ptr;
9506
9507 // Ensure we generate all stores for each tuple part, whilst updating the
9508 // pointer after each store correctly using vscale.
9509 while (NumParts) {
9510 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9511 MemOpChains.push_back(Store);
9512
9513 NumParts--;
9514 if (NumParts > 0) {
9515 SDValue BytesIncrement;
9516 if (isScalable) {
9517 BytesIncrement = DAG.getVScale(
9518 DL, Ptr.getValueType(),
9519 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9520 } else {
9521 BytesIncrement = DAG.getConstant(
9522 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9523 Ptr.getValueType());
9524 }
9525 MPI = MachinePointerInfo(MPI.getAddrSpace());
9526 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9527 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9528 ExtraArgLocs++;
9529 i++;
9530 }
9531 }
9532
9533 Arg = SpillSlot;
9534 break;
9535 }
9536
9537 if (VA.isRegLoc()) {
9538 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9539 Outs[0].VT == MVT::i64) {
9540 assert(VA.getLocVT() == MVT::i64 &&
9541 "unexpected calling convention register assignment");
9542 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9543 "unexpected use of 'returned'");
9544 IsThisReturn = true;
9545 }
9546 if (RegsUsed.count(VA.getLocReg())) {
9547 // If this register has already been used then we're trying to pack
9548 // parts of an [N x i32] into an X-register. The extension type will
9549 // take care of putting the two halves in the right place but we have to
9550 // combine them.
9551 SDValue &Bits =
9552 llvm::find_if(RegsToPass,
9553 [=](const std::pair<unsigned, SDValue> &Elt) {
9554 return Elt.first == VA.getLocReg();
9555 })
9556 ->second;
9557 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9558 // Call site info is used for function's parameter entry value
9559 // tracking. For now we track only simple cases when parameter
9560 // is transferred through whole register.
9561 llvm::erase_if(CSInfo.ArgRegPairs,
9562 [&VA](MachineFunction::ArgRegPair ArgReg) {
9563 return ArgReg.Reg == VA.getLocReg();
9564 });
9565 } else {
9566 // Add an extra level of indirection for streaming mode changes by
9567 // using a pseudo copy node that cannot be rematerialised between a
9568 // smstart/smstop and the call by the simple register coalescer.
9569 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9570 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9571 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9572 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9573 RegsUsed.insert(VA.getLocReg());
9574 const TargetOptions &Options = DAG.getTarget().Options;
9575 if (Options.EmitCallSiteInfo)
9576 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9577 }
9578 } else {
9579 assert(VA.isMemLoc());
9580
9581 SDValue DstAddr;
9582 MachinePointerInfo DstInfo;
9583
9584 // FIXME: This works on big-endian for composite byvals, which are the
9585 // common case. It should also work for fundamental types.
9586 uint32_t BEAlign = 0;
9587 unsigned OpSize;
9588 if (VA.getLocInfo() == CCValAssign::Indirect ||
9589 VA.getValVT().isScalableVector())
9590 OpSize = VA.getLocVT().getFixedSizeInBits();
9591 else
9592 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9593 : VA.getValVT().getSizeInBits();
9594 OpSize = (OpSize + 7) / 8;
9595 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9596 !Flags.isInConsecutiveRegs()) {
9597 if (OpSize < 8)
9598 BEAlign = 8 - OpSize;
9599 }
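// Editor's note (worked example, not part of the LLVM source): on big-endian
// targets a small scalar occupies the high-order end of its 8-byte stack
// slot, so an i16 argument (OpSize = 2 bytes) gets BEAlign = 6 and is stored
// at LocMemOffset + 6 instead of LocMemOffset, which is where a big-endian
// callee will load it from.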
9600 unsigned LocMemOffset = VA.getLocMemOffset();
9601 int32_t Offset = LocMemOffset + BEAlign;
9602
9603 if (IsTailCall) {
9604 // When the frame pointer is perfectly aligned for the tail call and the
9605 // same stack argument is passed down intact, we can reuse it.
9606 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9607 continue;
9608
9609 Offset = Offset + FPDiff;
9610 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9611
9612 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9613 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9614
9615 // Make sure any stack arguments overlapping with where we're storing
9616 // are loaded before this eventual operation. Otherwise they'll be
9617 // clobbered.
9618 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9619 } else {
9620 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9621
9622 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9623 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9624 }
9625
9626 if (Outs[i].Flags.isByVal()) {
9627 SDValue SizeNode =
9628 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9629 SDValue Cpy = DAG.getMemcpy(
9630 Chain, DL, DstAddr, Arg, SizeNode,
9631 Outs[i].Flags.getNonZeroByValAlign(),
9632 /*isVol = */ false, /*AlwaysInline = */ false,
9633 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9634
9635 MemOpChains.push_back(Cpy);
9636 } else {
9637 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9638 // promoted to a legal register type i32, we should truncate Arg back to
9639 // i1/i8/i16.
9640 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9641 VA.getValVT() == MVT::i16)
9642 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9643
9644 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9645 MemOpChains.push_back(Store);
9646 }
9647 }
9648 }
9649
9650 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9651 !(CLI.CB && CLI.CB->isMustTailCall())) {
9652 SDValue ParamPtr = StackPtr;
9653 if (IsTailCall) {
9654 // Create a dummy object at the top of the stack that can be used to get
9655 // the SP after the epilogue
9656 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9657 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9658 }
9659
9660 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9661 // describing the argument list. x4 contains the address of the
9662 // first stack parameter. x5 contains the size in bytes of all parameters
9663 // passed on the stack.
9664 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9665 RegsToPass.emplace_back(AArch64::X5,
9666 DAG.getConstant(NumBytes, DL, MVT::i64));
9667 }
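// Editor's note (illustrative, not part of the LLVM source): for an Arm64EC
// variadic call with, say, 40 bytes of stack-passed arguments, the block
// above ends up with x4 holding the address of the first stack parameter
// (recovered through the dummy fixed object when tail-calling) and x5
// holding 40, which is how the x64 side of the thunk walks the argument
// list.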
9668
9669 if (!MemOpChains.empty())
9670 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9671
9672 SDValue InGlue;
9673 if (RequiresSMChange) {
9674 Chain =
9675 changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
9676 Chain, InGlue, getSMToggleCondition(CallAttrs));
9677 InGlue = Chain.getValue(1);
9678 }
9679
9680 // Build a sequence of copy-to-reg nodes chained together with token chain
9681 // and flag operands which copy the outgoing args into the appropriate regs.
9682 for (auto &RegToPass : RegsToPass) {
9683 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9684 RegToPass.second, InGlue);
9685 InGlue = Chain.getValue(1);
9686 }
9687
9688 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9689 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9690 // node so that legalize doesn't hack it.
9691 const GlobalValue *CalledGlobal = nullptr;
9692 unsigned OpFlags = 0;
9693 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9694 CalledGlobal = G->getGlobal();
9695 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9696 getTargetMachine());
9697 if (OpFlags & AArch64II::MO_GOT) {
9698 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9699 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9700 } else {
9701 const GlobalValue *GV = G->getGlobal();
9702 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9703 }
9704 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9705 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9706 Subtarget->isTargetMachO()) ||
9707 MF.getFunction().getParent()->getRtLibUseGOT();
9708 const char *Sym = S->getSymbol();
9709 if (UseGot) {
9710 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9711 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9712 } else {
9713 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9714 }
9715 }
9716
9717 // We don't usually want to end the call-sequence here because we would tidy
9718 // the frame up *after* the call; however, in the ABI-changing tail-call case
9719 // we've carefully laid out the parameters so that when sp is reset they'll be
9720 // in the correct location.
9721 if (IsTailCall && !IsSibCall) {
9722 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9723 InGlue = Chain.getValue(1);
9724 }
9725
9726 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9727
9728 std::vector<SDValue> Ops;
9729 Ops.push_back(Chain);
9730 Ops.push_back(Callee);
9731
9732 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9733 // be expanded to the call, directly followed by a special marker sequence and
9734 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
9735 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9736 assert(!IsTailCall &&
9737 "tail calls cannot be marked with clang.arc.attachedcall");
9738 Opc = AArch64ISD::CALL_RVMARKER;
9739
9740 // Add a target global address for the retainRV/claimRV runtime function
9741 // just before the call target.
9742 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9743 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9744 Ops.insert(Ops.begin() + 1, GA);
9745
9746 // We may or may not need to emit both the marker and the retain/claim call.
9747 // Tell the pseudo expansion using an additional boolean op.
9748 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9749 SDValue DoEmitMarker =
9750 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9751 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9752 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9753 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9754 } else if (GuardWithBTI) {
9755 Opc = AArch64ISD::CALL_BTI;
9756 }
9757
9758 if (IsTailCall) {
9759 // Each tail call may have to adjust the stack by a different amount, so
9760 // this information must travel along with the operation for eventual
9761 // consumption by emitEpilogue.
9762 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9763 }
9764
9765 if (CLI.PAI) {
9766 const uint64_t Key = CLI.PAI->Key;
9767 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9768 "Invalid auth call key");
9769
9770 // Split the discriminator into address/integer components.
9771 SDValue AddrDisc, IntDisc;
9772 std::tie(IntDisc, AddrDisc) =
9773 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9774
9775 if (Opc == AArch64ISD::CALL_RVMARKER)
9776 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9777 else
9778 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9779 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9780 Ops.push_back(IntDisc);
9781 Ops.push_back(AddrDisc);
9782 }
9783
9784 // Add argument registers to the end of the list so that they are known live
9785 // into the call.
9786 for (auto &RegToPass : RegsToPass)
9787 Ops.push_back(DAG.getRegister(RegToPass.first,
9788 RegToPass.second.getValueType()));
9789
9790 // Add a register mask operand representing the call-preserved registers.
9791 const uint32_t *Mask;
9792 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9793 if (IsThisReturn) {
9794 // For 'this' returns, use the X0-preserving mask if applicable
9795 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9796 if (!Mask) {
9797 IsThisReturn = false;
9798 Mask = TRI->getCallPreservedMask(MF, CallConv);
9799 }
9800 } else
9801 Mask = TRI->getCallPreservedMask(MF, CallConv);
9802
9803 if (Subtarget->hasCustomCallingConv())
9804 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9805
9806 if (TRI->isAnyArgRegReserved(MF))
9807 TRI->emitReservedArgRegCallError(MF);
9808
9809 assert(Mask && "Missing call preserved mask for calling convention");
9810 Ops.push_back(DAG.getRegisterMask(Mask));
9811
9812 if (InGlue.getNode())
9813 Ops.push_back(InGlue);
9814
9815 // If we're doing a tail call, use a TC_RETURN here rather than an
9816 // actual call instruction.
9817 if (IsTailCall) {
9818 MF.getFrameInfo().setHasTailCall();
9819 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9820 if (IsCFICall)
9821 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9822
9823 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9824 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9825 if (CalledGlobal &&
9826 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9827 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9828 return Ret;
9829 }
9830
9831 // Returns a chain and a flag for retval copy to use.
9832 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9833 if (IsCFICall)
9834 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9835
9836 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9837 InGlue = Chain.getValue(1);
9838 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9839 if (CalledGlobal &&
9840 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9841 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9842
9843 uint64_t CalleePopBytes =
9844 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9845
9846 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9847 InGlue = Chain.getValue(1);
9848
9849 // Handle result values, copying them out of physregs into vregs that we
9850 // return.
9851 SDValue Result = LowerCallResult(
9852 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9853 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9854
9855 if (!Ins.empty())
9856 InGlue = Result.getValue(Result->getNumValues() - 1);
9857
9858 if (RequiresSMChange) {
9859 Result = changeStreamingMode(
9860 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
9861 getSMToggleCondition(CallAttrs));
9862 }
9863
9864 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
9865 // Unconditionally resume ZA.
9866 Result = DAG.getNode(
9867 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
9868 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9869
9870 if (ShouldPreserveZT0)
9871 Result =
9872 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
9873 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9874
9875 if (RequiresLazySave) {
9876 // Conditionally restore the lazy save using a pseudo node.
9877 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
9878 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9879 SDValue RegMask = DAG.getRegisterMask(
9880 TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
9881 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9882 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
9883 SDValue TPIDR2_EL0 = DAG.getNode(
9884 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
9885 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
9886 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9887 // RESTORE_ZA pseudo.
9888 SDValue Glue;
9889 SDValue TPIDR2Block = DAG.getFrameIndex(
9890 TPIDR2.FrameIndex,
9891 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9892 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
9893 Result =
9894 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
9895 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
9896 RestoreRoutine, RegMask, Result.getValue(1)});
9897 // Finally reset the TPIDR2_EL0 register to 0.
9898 Result = DAG.getNode(
9899 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
9900 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9901 DAG.getConstant(0, DL, MVT::i64));
9902 TPIDR2.Uses++;
9903 } else if (RequiresSaveAllZA) {
9904 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
9905 /*IsSave=*/false);
9906 }
9907
9908 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9909 RequiresSaveAllZA) {
9910 for (unsigned I = 0; I < InVals.size(); ++I) {
9911 // The smstart/smstop is chained as part of the call, but when the
9912 // resulting chain is discarded (which happens when the call is not part
9913 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9914 // smstart/smstop is chained to the result value. We can do that by doing
9915 // a vreg -> vreg copy.
9916 Register Reg = MF.getRegInfo().createVirtualRegister(
9917 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
9918 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
9919 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
9920 InVals[I].getValueType());
9921 }
9922 }
9923
9924 if (CallConv == CallingConv::PreserveNone) {
9925 for (const ISD::OutputArg &O : Outs) {
9926 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9927 O.Flags.isSwiftAsync()) {
9930 MF.getFunction(),
9931 "Swift attributes can't be used with preserve_none",
9932 DL.getDebugLoc()));
9933 break;
9934 }
9935 }
9936 }
9937
9938 return Result;
9939}
9940
9941bool AArch64TargetLowering::CanLowerReturn(
9942 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9943 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
9944 const Type *RetTy) const {
9945 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9946 SmallVector<CCValAssign, 16> RVLocs;
9947 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9948 return CCInfo.CheckReturn(Outs, RetCC);
9949}
9950
9951SDValue
9952AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9953 bool isVarArg,
9954 const SmallVectorImpl<ISD::OutputArg> &Outs,
9955 const SmallVectorImpl<SDValue> &OutVals,
9956 const SDLoc &DL, SelectionDAG &DAG) const {
9957 auto &MF = DAG.getMachineFunction();
9958 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9959
9960 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9961 SmallVector<CCValAssign, 16> RVLocs;
9962 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9963 CCInfo.AnalyzeReturn(Outs, RetCC);
9964
9965 // Copy the result values into the output registers.
9966 SDValue Glue;
9967 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9968 SmallSet<unsigned, 4> RegsUsed;
9969 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9970 ++i, ++realRVLocIdx) {
9971 CCValAssign &VA = RVLocs[i];
9972 assert(VA.isRegLoc() && "Can only return in registers!");
9973 SDValue Arg = OutVals[realRVLocIdx];
9974
9975 switch (VA.getLocInfo()) {
9976 default:
9977 llvm_unreachable("Unknown loc info!");
9978 case CCValAssign::Full:
9979 if (Outs[i].ArgVT == MVT::i1) {
9980 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9981 // value. This is strictly redundant on Darwin (which uses "zeroext
9982 // i1"), but will be optimised out before ISel.
9983 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9984 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9985 }
9986 break;
9987 case CCValAssign::BCvt:
9988 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9989 break;
9990 case CCValAssign::AExt:
9991 case CCValAssign::ZExt:
9992 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9993 break;
9995 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9996 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9997 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9998 DAG.getConstant(32, DL, VA.getLocVT()));
9999 break;
10000 }
10001
10002 if (RegsUsed.count(VA.getLocReg())) {
10003 SDValue &Bits =
10004 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10005 return Elt.first == VA.getLocReg();
10006 })->second;
10007 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10008 } else {
10009 RetVals.emplace_back(VA.getLocReg(), Arg);
10010 RegsUsed.insert(VA.getLocReg());
10011 }
10012 }
10013
10014 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10015
10016 // Emit SMSTOP before returning from a locally streaming function
10017 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10018 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10019 if (FuncAttrs.hasStreamingCompatibleInterface())
10020 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10021 /*Glue*/ SDValue(),
10022 AArch64SME::IfCallerIsNonStreaming);
10023 else
10024 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10025 /*Glue*/ SDValue(), AArch64SME::Always);
10026 Glue = Chain.getValue(1);
10027 }
10028
10029 SmallVector<SDValue, 4> RetOps(1, Chain);
10030 for (auto &RetVal : RetVals) {
10031 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10032 isPassedInFPR(RetVal.second.getValueType()))
10033 RetVal.second =
10034 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10035 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10036 RetVal.second);
10037 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10038 Glue = Chain.getValue(1);
10039 RetOps.push_back(
10040 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10041 }
10042
10043 // Windows AArch64 ABIs require that for returning structs by value we copy
10044 // the sret argument into X0 for the return.
10045 // We saved the argument into a virtual register in the entry block,
10046 // so now we copy the value out and into X0.
10047 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10048 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10049 getPointerTy(MF.getDataLayout()));
10050
10051 unsigned RetValReg = AArch64::X0;
10052 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10053 RetValReg = AArch64::X8;
10054 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10055 Glue = Chain.getValue(1);
10056
10057 RetOps.push_back(
10058 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10059 }
10060
10061 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10062 if (I) {
10063 for (; *I; ++I) {
10064 if (AArch64::GPR64RegClass.contains(*I))
10065 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10066 else if (AArch64::FPR64RegClass.contains(*I))
10067 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10068 else
10069 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10070 }
10071 }
10072
10073 RetOps[0] = Chain; // Update chain.
10074
10075 // Add the glue if we have it.
10076 if (Glue.getNode())
10077 RetOps.push_back(Glue);
10078
10079 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10080 // ARM64EC entry thunks use a special return sequence: instead of a regular
10081 // "ret" instruction, they need to explicitly call the emulator.
10082 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10083 SDValue Arm64ECRetDest =
10084 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10085 Arm64ECRetDest =
10086 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10087 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10088 MachinePointerInfo());
10089 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10090 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10091 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10092 }
10093
10094 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10095}
10096
10097//===----------------------------------------------------------------------===//
10098// Other Lowering Code
10099//===----------------------------------------------------------------------===//
10100
10101SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10102 SelectionDAG &DAG,
10103 unsigned Flag) const {
10104 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10105 N->getOffset(), Flag);
10106}
10107
10108SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10109 SelectionDAG &DAG,
10110 unsigned Flag) const {
10111 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10112}
10113
10114SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10115 SelectionDAG &DAG,
10116 unsigned Flag) const {
10117 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10118 N->getOffset(), Flag);
10119}
10120
10121SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10122 SelectionDAG &DAG,
10123 unsigned Flag) const {
10124 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10125}
10126
10127SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10128 SelectionDAG &DAG,
10129 unsigned Flag) const {
10130 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10131}
10132
10133// (loadGOT sym)
10134template <class NodeTy>
10135SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10136 unsigned Flags) const {
10137 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10138 SDLoc DL(N);
10139 EVT Ty = getPointerTy(DAG.getDataLayout());
10140 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10141 // FIXME: Once remat is capable of dealing with instructions with register
10142 // operands, expand this into two nodes instead of using a wrapper node.
10143 if (DAG.getMachineFunction()
10144 .getInfo<AArch64FunctionInfo>()
10145 ->hasELFSignedGOT())
10146 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10147 0);
10148 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10149}
10150
10151// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10152template <class NodeTy>
10153SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10154 unsigned Flags) const {
10155 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10156 SDLoc DL(N);
10157 EVT Ty = getPointerTy(DAG.getDataLayout());
10158 const unsigned char MO_NC = AArch64II::MO_NC;
10159 return DAG.getNode(
10160 AArch64ISD::WrapperLarge, DL, Ty,
10161 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10162 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10163 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10164 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10165}
10166
10167// (addlow (adrp %hi(sym)) %lo(sym))
10168template <class NodeTy>
10169SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10170 unsigned Flags) const {
10171 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10172 SDLoc DL(N);
10173 EVT Ty = getPointerTy(DAG.getDataLayout());
10174 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10175 SDValue Lo = getTargetNode(N, Ty, DAG,
10176 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10177 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10178 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10179}
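// Editor's note (illustrative, not part of the LLVM source): for a global
// 'g' in the small code model the ADRP + ADDlow pair built above is selected
// into the usual two-instruction materialisation
//   adrp x0, g            // page address of g (MO_PAGE)
//   add  x0, x0, :lo12:g  // low 12 bits (MO_PAGEOFF, no overflow check)
// leaving the full address of 'g' in x0.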
10180
10181// (adr sym)
10182template <class NodeTy>
10183SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10184 unsigned Flags) const {
10185 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10186 SDLoc DL(N);
10187 EVT Ty = getPointerTy(DAG.getDataLayout());
10188 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10189 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10190}
10191
10192SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10193 SelectionDAG &DAG) const {
10194 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10195 const GlobalValue *GV = GN->getGlobal();
10196 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10197
10198 if (OpFlags != AArch64II::MO_NO_FLAG)
10199 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
10200 "unexpected offset in global node");
10201
10202 // This also catches the large code model case for Darwin, and tiny code
10203 // model with got relocations.
10204 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10205 return getGOT(GN, DAG, OpFlags);
10206 }
10207
10208 SDValue Result;
10209 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10210 !getTargetMachine().isPositionIndependent()) {
10211 Result = getAddrLarge(GN, DAG, OpFlags);
10212 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10213 Result = getAddrTiny(GN, DAG, OpFlags);
10214 } else {
10215 Result = getAddr(GN, DAG, OpFlags);
10216 }
10217 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10218 SDLoc DL(GN);
10219 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10220 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10221 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10222 return Result;
10223}
10224
10225/// Convert a TLS address reference into the correct sequence of loads
10226/// and calls to compute the variable's address (for Darwin, currently) and
10227/// return an SDValue containing the final node.
10228
10229/// Darwin only has one TLS scheme which must be capable of dealing with the
10230/// fully general situation, in the worst case. This means:
10231/// + "extern __thread" declaration.
10232/// + Defined in a possibly unknown dynamic library.
10233///
10234/// The general system is that each __thread variable has a [3 x i64] descriptor
10235/// which contains information used by the runtime to calculate the address. The
10236/// only part of this the compiler needs to know about is the first xword, which
10237/// contains a function pointer that must be called with the address of the
10238/// entire descriptor in "x0".
10239///
10240/// Since this descriptor may be in a different unit, in general even the
10241/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10242/// is:
10243/// adrp x0, _var@TLVPPAGE
10244/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10245/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10246/// ; the function pointer
10247/// blr x1 ; Uses descriptor address in x0
10248/// ; Address of _var is now in x0.
10249///
10250/// If the address of _var's descriptor *is* known to the linker, then it can
10251/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10252/// a slight efficiency gain.
10253SDValue
10254AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10255 SelectionDAG &DAG) const {
10256 assert(Subtarget->isTargetDarwin() &&
10257 "This function expects a Darwin target");
10258
10259 SDLoc DL(Op);
10260 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10261 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10262 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10263
10264 SDValue TLVPAddr =
10265 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10266 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10267
10268 // The first entry in the descriptor is a function pointer that we must call
10269 // to obtain the address of the variable.
10270 SDValue Chain = DAG.getEntryNode();
10271 SDValue FuncTLVGet = DAG.getLoad(
10272 PtrMemVT, DL, Chain, DescAddr,
10273 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10274 Align(PtrMemVT.getSizeInBits() / 8),
10275 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10276 Chain = FuncTLVGet.getValue(1);
10277
10278 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10279 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10280
10281   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10282   MFI.setAdjustsStack(true);
10283
10284 // TLS calls preserve all registers except those that absolutely must be
10285 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10286 // silly).
10287 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10288 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10289 if (Subtarget->hasCustomCallingConv())
10290 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10291
10292 // Finally, we can make the call. This is just a degenerate version of a
10293 // normal AArch64 call node: x0 takes the address of the descriptor, and
10294 // returns the address of the variable in this thread.
10295 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10296
10297 unsigned Opcode = AArch64ISD::CALL;
10299 Ops.push_back(Chain);
10300 Ops.push_back(FuncTLVGet);
10301
10302 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10303 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10304 Opcode = AArch64ISD::AUTH_CALL;
10305 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10306 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10307 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10308 }
10309
10310 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10311 Ops.push_back(DAG.getRegisterMask(Mask));
10312 Ops.push_back(Chain.getValue(1));
10313 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10314 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10315}
10316
10317/// Convert a thread-local variable reference into a sequence of instructions to
10318/// compute the variable's address for the local exec TLS model of ELF targets.
10319/// The sequence depends on the maximum TLS area size.
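/// For example (illustrative; -mtls-size is assumed to be the usual clang/llc
/// way of setting TargetOptions::TLSSize): -mtls-size=12 limits the TLS area
/// to 4KiB and selects the single-add 'case 12' sequence below, while the
/// default of 24 (16MiB) uses the tprel_hi12/tprel_lo12_nc pair in 'case 24'.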
10320SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10321 SDValue ThreadBase,
10322 const SDLoc &DL,
10323 SelectionDAG &DAG) const {
10324 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10325 SDValue TPOff, Addr;
10326
10327 switch (DAG.getTarget().Options.TLSSize) {
10328 default:
10329 llvm_unreachable("Unexpected TLS size");
10330
10331 case 12: {
10332 // mrs x0, TPIDR_EL0
10333 // add x0, x0, :tprel_lo12:a
10334     SDValue Var = DAG.getTargetGlobalAddress(
10335         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10336 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10337 Var,
10338 DAG.getTargetConstant(0, DL, MVT::i32)),
10339 0);
10340 }
10341
10342 case 24: {
10343 // mrs x0, TPIDR_EL0
10344 // add x0, x0, :tprel_hi12:a
10345 // add x0, x0, :tprel_lo12_nc:a
10346 SDValue HiVar = DAG.getTargetGlobalAddress(
10347 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10348 SDValue LoVar = DAG.getTargetGlobalAddress(
10349         GV, DL, PtrVT, 0,
10350         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10351 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10352 HiVar,
10353 DAG.getTargetConstant(0, DL, MVT::i32)),
10354 0);
10355 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10356 LoVar,
10357 DAG.getTargetConstant(0, DL, MVT::i32)),
10358 0);
10359 }
10360
10361 case 32: {
10362 // mrs x1, TPIDR_EL0
10363 // movz x0, #:tprel_g1:a
10364 // movk x0, #:tprel_g0_nc:a
10365 // add x0, x1, x0
10366 SDValue HiVar = DAG.getTargetGlobalAddress(
10367 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10368 SDValue LoVar = DAG.getTargetGlobalAddress(
10369         GV, DL, PtrVT, 0,
10370         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10371 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10372 DAG.getTargetConstant(16, DL, MVT::i32)),
10373 0);
10374 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10375 DAG.getTargetConstant(0, DL, MVT::i32)),
10376 0);
10377 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10378 }
10379
10380 case 48: {
10381 // mrs x1, TPIDR_EL0
10382 // movz x0, #:tprel_g2:a
10383 // movk x0, #:tprel_g1_nc:a
10384 // movk x0, #:tprel_g0_nc:a
10385 // add x0, x1, x0
10386 SDValue HiVar = DAG.getTargetGlobalAddress(
10387 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10388 SDValue MiVar = DAG.getTargetGlobalAddress(
10389         GV, DL, PtrVT, 0,
10390         AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10391 SDValue LoVar = DAG.getTargetGlobalAddress(
10392         GV, DL, PtrVT, 0,
10393         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10394 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10395 DAG.getTargetConstant(32, DL, MVT::i32)),
10396 0);
10397 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10398 DAG.getTargetConstant(16, DL, MVT::i32)),
10399 0);
10400 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10401 DAG.getTargetConstant(0, DL, MVT::i32)),
10402 0);
10403 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10404 }
10405 }
10406}
10407
10408/// When accessing thread-local variables under either the general-dynamic or
10409/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10410 /// have a descriptor, accessible via a PC-relative ADRP, whose first entry
10411 /// is a function pointer that carries out the resolution.
10412///
10413/// The sequence is:
10414/// adrp x0, :tlsdesc:var
10415/// ldr x1, [x0, #:tlsdesc_lo12:var]
10416/// add x0, x0, #:tlsdesc_lo12:var
10417/// .tlsdesccall var
10418/// blr x1
10419/// (TPIDR_EL0 offset now in x0)
10420///
10421/// The above sequence must be produced unscheduled, to enable the linker to
10422/// optimize/relax this sequence.
10423/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10424 /// above sequence, and is expanded very late in the compilation flow, to
10425 /// ensure the sequence is produced exactly as shown above.
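/// For example (illustrative, assuming standard GNU ld/lld TLS relaxation): if
/// the linker can prove 'var' lives in the main executable's own TLS block, it
/// may rewrite the sequence roughly as
///   movz x0, #:tprel_g1:var
///   movk x0, #:tprel_g0_nc:var
///   nop
///   nop
/// which is only possible if the four instructions are emitted contiguously
/// and in the exact order shown.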
10426SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10427 const SDLoc &DL,
10428 SelectionDAG &DAG) const {
10429 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10430
10431 SDValue Chain = DAG.getEntryNode();
10432 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10433
10434 unsigned Opcode =
10435 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10436 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10437 : AArch64ISD::TLSDESC_CALLSEQ;
10438 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10439 SDValue Glue = Chain.getValue(1);
10440
10441 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10442}
10443
10444SDValue
10445AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10446 SelectionDAG &DAG) const {
10447 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10448
10449 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10450 AArch64FunctionInfo *MFI =
10451       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10452 
10456
10458 if (Model == TLSModel::LocalDynamic)
10460 }
10461
10463 Model != TLSModel::LocalExec)
10464 report_fatal_error("ELF TLS only supported in small memory model or "
10465 "in local exec TLS model");
10466 // Different choices can be made for the maximum size of the TLS area for a
10467 // module. For the small address model, the default TLS size is 16MiB and the
10468 // maximum TLS size is 4GiB.
10469 // FIXME: add tiny and large code model support for TLS access models other
10470 // than local exec. We currently generate the same code as small for tiny,
10471 // which may be larger than needed.
10472
10473 SDValue TPOff;
10474 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10475 SDLoc DL(Op);
10476 const GlobalValue *GV = GA->getGlobal();
10477
10478 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10479
10480 if (Model == TLSModel::LocalExec) {
10481 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10482 } else if (Model == TLSModel::InitialExec) {
10483 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10484 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10485 } else if (Model == TLSModel::LocalDynamic) {
10486 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10487 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10488 // the beginning of the module's TLS region, followed by a DTPREL offset
10489 // calculation.
10490
10491 // These accesses will need deduplicating if there's more than one.
10492     MFI->incNumLocalDynamicTLSAccesses();
10493 
10494 // The call needs a relocation too for linker relaxation. It doesn't make
10495 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10496 // the address.
10497     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10498                                                   AArch64II::MO_TLS);
10499
10500 // Now we can calculate the offset from TPIDR_EL0 to this module's
10501 // thread-local area.
10502 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10503
10504 // Now use :dtprel_whatever: operations to calculate this variable's offset
10505 // in its thread-storage area.
10506 SDValue HiVar = DAG.getTargetGlobalAddress(
10507 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10508 SDValue LoVar = DAG.getTargetGlobalAddress(
10509         GV, DL, MVT::i64, 0,
10510         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10511
10512 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10513 DAG.getTargetConstant(0, DL, MVT::i32)),
10514 0);
10515 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10516 DAG.getTargetConstant(0, DL, MVT::i32)),
10517 0);
10518 } else if (Model == TLSModel::GeneralDynamic) {
10519 // The call needs a relocation too for linker relaxation. It doesn't make
10520 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10521 // the address.
10522 SDValue SymAddr =
10523 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10524
10525 // Finally we can make a call to calculate the offset from tpidr_el0.
10526 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10527 } else
10528 llvm_unreachable("Unsupported ELF TLS access model");
10529
10530 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10531}
10532
10533SDValue
10534AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10535 SelectionDAG &DAG) const {
10536 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10537
10538 SDValue Chain = DAG.getEntryNode();
10539 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10540 SDLoc DL(Op);
10541
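  // In effect (an illustrative summary of the code below), the address
  // computed here is
  //   ((char **)*(TEB + 0x58))[_tls_index] + <offset of the variable in .tls>
  // i.e. TEB -> ThreadLocalStoragePointer -> this module's TLS block, plus the
  // variable's section-relative offset.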
10542 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10543
10544 // Load the ThreadLocalStoragePointer from the TEB
10545 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10546 SDValue TLSArray =
10547 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10548 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10549 Chain = TLSArray.getValue(1);
10550
10551   // Load the TLS index from the C runtime.
10552 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10553 // This also does the same as LOADgot, but using a generic i32 load,
10554 // while LOADgot only loads i64.
10555 SDValue TLSIndexHi =
10556 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10557 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10558 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10559 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10560 SDValue TLSIndex =
10561 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10562 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10563 Chain = TLSIndex.getValue(1);
10564
10565   // The pointer to the thread's TLS data area is stored in the TLS array at
10566   // the slot indexed by _tls_index, i.e. at byte offset _tls_index * 8.
10567 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10568 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10569 DAG.getConstant(3, DL, PtrVT));
10570 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10571                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10572                             MachinePointerInfo());
10573 Chain = TLS.getValue(1);
10574
10575 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10576 const GlobalValue *GV = GA->getGlobal();
10577 SDValue TGAHi = DAG.getTargetGlobalAddress(
10578 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10579 SDValue TGALo = DAG.getTargetGlobalAddress(
10580       GV, DL, PtrVT, 0,
10581       AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10582
10583 // Add the offset from the start of the .tls section (section base).
10584 SDValue Addr =
10585 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10586 DAG.getTargetConstant(0, DL, MVT::i32)),
10587 0);
10588 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10589 return Addr;
10590}
10591
10592SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10593 SelectionDAG &DAG) const {
10594 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10595 if (DAG.getTarget().useEmulatedTLS())
10596 return LowerToTLSEmulatedModel(GA, DAG);
10597
10598 if (Subtarget->isTargetDarwin())
10599 return LowerDarwinGlobalTLSAddress(Op, DAG);
10600 if (Subtarget->isTargetELF())
10601 return LowerELFGlobalTLSAddress(Op, DAG);
10602 if (Subtarget->isTargetWindows())
10603 return LowerWindowsGlobalTLSAddress(Op, DAG);
10604
10605 llvm_unreachable("Unexpected platform trying to use TLS");
10606}
10607
10608//===----------------------------------------------------------------------===//
10609// PtrAuthGlobalAddress lowering
10610//
10611// We have 3 lowering alternatives to choose from:
10612// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10613// If the GV doesn't need a GOT load (i.e., is locally defined)
10614// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10615//
10616// - LOADgotPAC: similar to LOADgot, with added PAC.
10617// If the GV needs a GOT load, materialize the pointer using the usual
10618// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10619// section is assumed to be read-only (for example, via relro mechanism). See
10620// LowerMOVaddrPAC.
10621//
10622// - LOADauthptrstatic: similar to LOADgot, but use a
10623// special stub slot instead of a GOT slot.
10624// Load a signed pointer for symbol 'sym' from a stub slot named
10625 //   'sym$auth_ptr$key$disc' filled in by the dynamic linker during relocation
10626 //   resolution. This usually lowers to adrp+ldr, but also emits an entry into
10627// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10628//
10629 // All 3 are pseudos that are expanded late into longer sequences: this lets us
10630// provide integrity guarantees on the to-be-signed intermediate values.
10631//
10632// LOADauthptrstatic is undesirable because it requires a large section filled
10633// with often similarly-signed pointers, making it a good harvesting target.
10634// Thus, it's only used for ptrauth references to extern_weak to avoid null
10635// checks.
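// For example (an illustrative sketch, not a statement of the exact expansion):
// a signed reference to a global defined in the current TU, with key IA and a
// constant discriminator, needs no GOT load and becomes MOVaddrPAC (roughly
// adrp+add to form the raw address, followed by a PAC with the given key and
// discriminator). The same reference to an extern_weak symbol instead goes
// through LOADauthptrstatic, avoiding an inline null check before signing.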
10636
10638 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10639 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10640 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10641 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10642
10643 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10644 // offset alone as a pointer if the symbol wasn't available, which would
10645 // probably break null checks in users. Ptrauth complicates things further:
10646 // error out.
10647 if (TGN->getOffset() != 0)
10649 "unsupported non-zero offset in weak ptrauth global reference");
10650
10651 if (!isNullConstant(AddrDiscriminator))
10652 report_fatal_error("unsupported weak addr-div ptrauth global");
10653
10654 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10655 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10656 {TGA, Key, Discriminator}),
10657 0);
10658}
10659
10660SDValue
10661AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10662 SelectionDAG &DAG) const {
10663 SDValue Ptr = Op.getOperand(0);
10664 uint64_t KeyC = Op.getConstantOperandVal(1);
10665 SDValue AddrDiscriminator = Op.getOperand(2);
10666 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10667 EVT VT = Op.getValueType();
10668 SDLoc DL(Op);
10669
10670 if (KeyC > AArch64PACKey::LAST)
10671 report_fatal_error("key in ptrauth global out of range [0, " +
10672 Twine((int)AArch64PACKey::LAST) + "]");
10673
10674   // Blend only works if the integer discriminator is 16 bits wide.
10675 if (!isUInt<16>(DiscriminatorC))
10677 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10678
10679 // Choosing between 3 lowering alternatives is target-specific.
10680 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10681 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10682
10683 int64_t PtrOffsetC = 0;
10684 if (Ptr.getOpcode() == ISD::ADD) {
10685 PtrOffsetC = Ptr.getConstantOperandVal(1);
10686 Ptr = Ptr.getOperand(0);
10687 }
10688 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10689 const GlobalValue *PtrGV = PtrN->getGlobal();
10690
10691 // Classify the reference to determine whether it needs a GOT load.
10692 const unsigned OpFlags =
10693 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10694 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10695 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10696 "unsupported non-GOT op flags on ptrauth global reference");
10697
10698 // Fold any offset into the GV; our pseudos expect it there.
10699 PtrOffsetC += PtrN->getOffset();
10700 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10701 /*TargetFlags=*/0);
10702 assert(PtrN->getTargetFlags() == 0 &&
10703 "unsupported target flags on ptrauth global");
10704
10705 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10706 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10707 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10708 ? AddrDiscriminator
10709 : DAG.getRegister(AArch64::XZR, MVT::i64);
10710
10711 // No GOT load needed -> MOVaddrPAC
10712 if (!NeedsGOTLoad) {
10713 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10714 return SDValue(
10715 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10716 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10717 0);
10718 }
10719
10720 // GOT load -> LOADgotPAC
10721 // Note that we disallow extern_weak refs to avoid null checks later.
10722 if (!PtrGV->hasExternalWeakLinkage())
10723 return SDValue(
10724 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10725 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10726 0);
10727
10728 // extern_weak ref -> LOADauthptrstatic
10730 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10731 DAG);
10732}
10733
10734// Looks through \param Val to determine the bit that can be used to
10735// check the sign of the value. It returns the unextended value and
10736// the sign bit position.
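// For example (illustrative): given (sign_extend_inreg i64 %x, i8), this
// returns {%x, 7}, so a sign test against zero can be emitted as a tbnz/tbz
// on bit 7 of the unextended value.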
10737std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10738 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10739 return {Val.getOperand(0),
10740 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10741 1};
10742
10743 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10744 return {Val.getOperand(0),
10745 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10746
10747 return {Val, Val.getValueSizeInBits() - 1};
10748}
10749
10750SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10751 SDValue Chain = Op.getOperand(0);
10752 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10753 SDValue LHS = Op.getOperand(2);
10754 SDValue RHS = Op.getOperand(3);
10755 SDValue Dest = Op.getOperand(4);
10756 SDLoc DL(Op);
10757 
10758   MachineFunction &MF = DAG.getMachineFunction();
10759 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10760 // will not be produced, as they are conditional branch instructions that do
10761 // not set flags.
10762 bool ProduceNonFlagSettingCondBr =
10763 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10764
10765 // Handle f128 first, since lowering it will result in comparing the return
10766 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10767 // is expecting to deal with.
10768 if (LHS.getValueType() == MVT::f128) {
10769 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10770
10771 // If softenSetCCOperands returned a scalar, we need to compare the result
10772 // against zero to select between true and false values.
10773 if (!RHS.getNode()) {
10774 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10775 CC = ISD::SETNE;
10776 }
10777 }
10778
10779 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10780 // instruction.
10781 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10782 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10783 // Only lower legal XALUO ops.
10784 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10785 return SDValue();
10786
10787 // The actual operation with overflow check.
10788     AArch64CC::CondCode OFCC;
10789     SDValue Value, Overflow;
10790 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10791
10792 if (CC == ISD::SETNE)
10793 OFCC = getInvertedCondCode(OFCC);
10794 SDValue CCVal = getCondCode(DAG, OFCC);
10795
10796 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10797 Overflow);
10798 }
10799
10800 if (LHS.getValueType().isInteger()) {
10801 assert((LHS.getValueType() == RHS.getValueType()) &&
10802 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10803
10804 // If the RHS of the comparison is zero, we can potentially fold this
10805 // to a specialized branch.
10806 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10807 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10808 if (CC == ISD::SETEQ) {
10809 // See if we can use a TBZ to fold in an AND as well.
10810 // TBZ has a smaller branch displacement than CBZ. If the offset is
10811 // out of bounds, a late MI-layer pass rewrites branches.
10812 // 403.gcc is an example that hits this case.
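        // For example (illustrative): br_cc (and x, 0x10), 0, seteq, dest
        // becomes "tbz x, #4, dest", folding away the separate and/cmp.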
10813 if (LHS.getOpcode() == ISD::AND &&
10814 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10815 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10816 SDValue Test = LHS.getOperand(0);
10817 uint64_t Mask = LHS.getConstantOperandVal(1);
10818 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
10819 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10820 Dest);
10821 }
10822
10823 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
10824 } else if (CC == ISD::SETNE) {
10825 // See if we can use a TBZ to fold in an AND as well.
10826 // TBZ has a smaller branch displacement than CBZ. If the offset is
10827 // out of bounds, a late MI-layer pass rewrites branches.
10828 // 403.gcc is an example that hits this case.
10829 if (LHS.getOpcode() == ISD::AND &&
10830 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10831 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10832 SDValue Test = LHS.getOperand(0);
10833 uint64_t Mask = LHS.getConstantOperandVal(1);
10834 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
10835 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10836 Dest);
10837 }
10838
10839 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
10840 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10841 // Don't combine AND since emitComparison converts the AND to an ANDS
10842 // (a.k.a. TST) and the test in the test bit and branch instruction
10843 // becomes redundant. This would also increase register pressure.
10844 uint64_t SignBitPos;
10845 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10846 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
10847 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
10848 }
10849 }
10850 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10851 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10852 // Don't combine AND since emitComparison converts the AND to an ANDS
10853 // (a.k.a. TST) and the test in the test bit and branch instruction
10854 // becomes redundant. This would also increase register pressure.
10855 uint64_t SignBitPos;
10856 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10857 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
10858 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
10859 }
10860
10861     // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
10862     // larger branch displacement, but we do prefer CB over cmp + br.
10863 if (Subtarget->hasCMPBR() &&
10865 ProduceNonFlagSettingCondBr) {
10866 SDValue Cond =
10868 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
10869 Dest);
10870 }
10871
10872 SDValue CCVal;
10873 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
10874 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10875 Cmp);
10876 }
10877
10878 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10879 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10880
10881 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10882 // clean. Some of them require two branches to implement.
10883 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
10884 AArch64CC::CondCode CC1, CC2;
10885 changeFPCCToAArch64CC(CC, CC1, CC2);
10886 SDValue CC1Val = getCondCode(DAG, CC1);
10887 SDValue BR1 =
10888 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
10889 if (CC2 != AArch64CC::AL) {
10890 SDValue CC2Val = getCondCode(DAG, CC2);
10891 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
10892 Cmp);
10893 }
10894
10895 return BR1;
10896}
10897
10898SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10899 SelectionDAG &DAG) const {
10900 if (!Subtarget->isNeonAvailable() &&
10901 !Subtarget->useSVEForFixedLengthVectors())
10902 return SDValue();
10903
10904 EVT VT = Op.getValueType();
10905 EVT IntVT = VT.changeTypeToInteger();
10906 SDLoc DL(Op);
10907
10908 SDValue In1 = Op.getOperand(0);
10909 SDValue In2 = Op.getOperand(1);
10910 EVT SrcVT = In2.getValueType();
10911
10912 if (!SrcVT.bitsEq(VT))
10913 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
10914
10915 if (VT.isScalableVector())
10916 IntVT =
10918
10919 if (VT.isFixedLengthVector() &&
10920 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
10921 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10922
10923 In1 = convertToScalableVector(DAG, ContainerVT, In1);
10924 In2 = convertToScalableVector(DAG, ContainerVT, In2);
10925
10926 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
10927 return convertFromScalableVector(DAG, VT, Res);
10928 }
10929
10930 // With SVE, but without Neon, extend the scalars to scalable vectors and use
10931 // a SVE FCOPYSIGN.
10932 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
10933 Subtarget->isSVEorStreamingSVEAvailable()) {
10934 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
10935 return SDValue();
10936 EVT SVT = getPackedSVEVectorVT(VT);
10937
10938 SDValue Ins1 =
10939 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
10940 DAG.getConstant(0, DL, MVT::i64));
10941 SDValue Ins2 =
10942 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
10943 DAG.getConstant(0, DL, MVT::i64));
10944 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
10945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
10946 DAG.getConstant(0, DL, MVT::i64));
10947 }
10948
10949 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10950 if (VT.isScalableVector())
10951 return getSVESafeBitCast(VT, Op, DAG);
10952
10953 return DAG.getBitcast(VT, Op);
10954 };
10955
10956 SDValue VecVal1, VecVal2;
10957 EVT VecVT;
10958 auto SetVecVal = [&](int Idx = -1) {
10959 if (!VT.isVector()) {
10960 VecVal1 =
10961 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
10962 VecVal2 =
10963 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
10964 } else {
10965 VecVal1 = BitCast(VecVT, In1, DAG);
10966 VecVal2 = BitCast(VecVT, In2, DAG);
10967 }
10968 };
10969 if (VT.isVector()) {
10970 VecVT = IntVT;
10971 SetVecVal();
10972 } else if (VT == MVT::f64) {
10973 VecVT = MVT::v2i64;
10974 SetVecVal(AArch64::dsub);
10975 } else if (VT == MVT::f32) {
10976 VecVT = MVT::v4i32;
10977 SetVecVal(AArch64::ssub);
10978 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10979 VecVT = MVT::v8i16;
10980 SetVecVal(AArch64::hsub);
10981 } else {
10982 llvm_unreachable("Invalid type for copysign!");
10983 }
10984
10985 unsigned BitWidth = In1.getScalarValueSizeInBits();
10986 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10987
10988 // We want to materialize a mask with every bit but the high bit set, but the
10989 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10990 // 64-bit elements. Instead, materialize all bits set and then negate that.
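  // (Illustrative: for v2f64 this amounts to "movi v1.2d, #0xffffffffffffffff"
  // followed by "fneg v1.2d, v1.2d", leaving 0x7fffffffffffffff in each lane.)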
10991 if (VT == MVT::f64 || VT == MVT::v2f64) {
10992 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10993 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10994 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10995 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10996 }
10997
10998 SDValue BSP =
10999 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11000 if (VT == MVT::f16 || VT == MVT::bf16)
11001 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11002 if (VT == MVT::f32)
11003 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11004 if (VT == MVT::f64)
11005 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11006
11007 return BitCast(VT, BSP, DAG);
11008}
11009
11010SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11011 SelectionDAG &DAG) const {
11013 Attribute::NoImplicitFloat))
11014 return SDValue();
11015
11016 EVT VT = Op.getValueType();
11017 if (VT.isScalableVector() ||
11018 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11019 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11020
11021 bool IsParity = Op.getOpcode() == ISD::PARITY;
11022 SDValue Val = Op.getOperand(0);
11023 SDLoc DL(Op);
11024
11025   // For i32, a general parity computation using EORs is more efficient than
11026   // one using floating point.
11027 if (VT == MVT::i32 && IsParity)
11028 return SDValue();
11029
11030 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11031 if (VT == MVT::i32 || VT == MVT::i64) {
11032 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11033 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11034 DAG.getUNDEF(ContainerVT), Val,
11035 DAG.getVectorIdxConstant(0, DL));
11036 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11037 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11038 DAG.getVectorIdxConstant(0, DL));
11039 if (IsParity)
11040 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11041 return Val;
11042 }
11043
11044 if (VT == MVT::i128) {
11045 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11046 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11047 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11048 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11049 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11050 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11051 if (IsParity)
11052 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11053 return Val;
11054 }
11055 }
11056
11057 if (!Subtarget->isNeonAvailable())
11058 return SDValue();
11059
11060   // If there is no scalar CNT (popcount) instruction available, GPR popcount can
11061 // be more efficiently lowered to the following sequence that uses
11062 // AdvSIMD registers/instructions as long as the copies to/from
11063 // the AdvSIMD registers are cheap.
11064 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11065 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11066 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11067 // FMOV X0, D0 // copy result back to integer reg
11068 if (VT == MVT::i32 || VT == MVT::i64) {
11069 if (VT == MVT::i32)
11070 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11071 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11072
11073 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11074 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11075 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11076 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11077 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11078 DAG.getConstant(0, DL, MVT::i64));
11079 if (IsParity)
11080 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11081 return AddV;
11082 } else if (VT == MVT::i128) {
11083 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11084
11085 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11086 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11087 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11088 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11089 DAG.getConstant(0, DL, MVT::i64));
11090 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11091 if (IsParity)
11092 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11093 return AddV;
11094 }
11095
11096 assert(!IsParity && "ISD::PARITY of vector types not supported");
11097
11098 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11099 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11100 "Unexpected type for custom ctpop lowering");
11101
11102 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11103 Val = DAG.getBitcast(VT8Bit, Val);
11104 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11105
11106 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11107 VT.getVectorNumElements() >= 2) {
11108 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11109 SDValue Zeros = DAG.getConstant(0, DL, DT);
11110 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11111
11112 if (VT == MVT::v2i64) {
11113 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11114 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11115 } else if (VT == MVT::v2i32) {
11116 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11117 } else if (VT == MVT::v4i32) {
11118 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11119 } else {
11120 llvm_unreachable("Unexpected type for custom ctpop lowering");
11121 }
11122
11123 return Val;
11124 }
11125
11126 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
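  // For example (illustrative): for v4i32, the v16i8 byte counts are widened
  // twice with uaddlp, v16i8 -> v8i16 -> v4i32, summing adjacent pairs at each
  // step.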
11127 unsigned EltSize = 8;
11128 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11129 while (EltSize != VT.getScalarSizeInBits()) {
11130 EltSize *= 2;
11131 NumElts /= 2;
11132 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11133 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11134 }
11135
11136 return Val;
11137}
11138
11139SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11140 EVT VT = Op.getValueType();
11141 assert(VT.isScalableVector() ||
11143 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11144
11145 SDLoc DL(Op);
11146 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11147 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11148}
11149
11150SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11151 SelectionDAG &DAG) const {
11152
11153 EVT VT = Op.getValueType();
11154 SDLoc DL(Op);
11155 unsigned Opcode = Op.getOpcode();
11156 ISD::CondCode CC;
11157 switch (Opcode) {
11158 default:
11159 llvm_unreachable("Wrong instruction");
11160 case ISD::SMAX:
11161 CC = ISD::SETGT;
11162 break;
11163 case ISD::SMIN:
11164 CC = ISD::SETLT;
11165 break;
11166 case ISD::UMAX:
11167 CC = ISD::SETUGT;
11168 break;
11169 case ISD::UMIN:
11170 CC = ISD::SETULT;
11171 break;
11172 }
11173
11174 if (VT.isScalableVector() ||
11176 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
11177 switch (Opcode) {
11178 default:
11179 llvm_unreachable("Wrong instruction");
11180 case ISD::SMAX:
11181 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11182 case ISD::SMIN:
11183 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11184 case ISD::UMAX:
11185 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11186 case ISD::UMIN:
11187 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11188 }
11189 }
11190
11191 SDValue Op0 = Op.getOperand(0);
11192 SDValue Op1 = Op.getOperand(1);
11193 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11194 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11195}
11196
11197SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11198 SelectionDAG &DAG) const {
11199 EVT VT = Op.getValueType();
11200
11201 if (VT.isScalableVector() ||
11203 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11204 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11205
11206 SDLoc DL(Op);
11207 SDValue REVB;
11208 MVT VST;
11209
11210 switch (VT.getSimpleVT().SimpleTy) {
11211 default:
11212 llvm_unreachable("Invalid type for bitreverse!");
11213
11214 case MVT::v2i32: {
11215 VST = MVT::v8i8;
11216 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11217
11218 break;
11219 }
11220
11221 case MVT::v4i32: {
11222 VST = MVT::v16i8;
11223 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11224
11225 break;
11226 }
11227
11228 case MVT::v1i64: {
11229 VST = MVT::v8i8;
11230 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11231
11232 break;
11233 }
11234
11235 case MVT::v2i64: {
11236 VST = MVT::v16i8;
11237 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11238
11239 break;
11240 }
11241 }
11242
11243 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11244 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11245}
11246
11247 // Check whether N forms a continuous OR/XOR comparison chain.
11248static bool
11249isOrXorChain(SDValue N, unsigned &Num,
11250 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11251 if (Num == MaxXors)
11252 return false;
11253
11254 // Skip the one-use zext
11255 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11256 N = N->getOperand(0);
11257
11258 // The leaf node must be XOR
11259 if (N->getOpcode() == ISD::XOR) {
11260 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11261 Num++;
11262 return true;
11263 }
11264
11265 // All the non-leaf nodes must be OR.
11266 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11267 return false;
11268
11269 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11270 isOrXorChain(N->getOperand(1), Num, WorkList))
11271 return true;
11272 return false;
11273}
11274
11275 // Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp
11276 // expansion.
11277 SDValue LHS = N->getOperand(0);
11278 SDValue RHS = N->getOperand(1);
11279 SDLoc DL(N);
11280 EVT VT = N->getValueType(0);
11282
11283 // Only handle integer compares.
11284 if (N->getOpcode() != ISD::SETCC)
11285 return SDValue();
11286
11287 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11288 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11289 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
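  // For example (illustrative), a 16-byte memcmp-for-equality expanded to
  //   (seteq (or (xor a0, b0), (xor a1, b1)), 0)
  // can then be selected as roughly
  //   cmp  a0, b0
  //   ccmp a1, b1, #0, eq
  //   cset w0, eq
  // instead of two eor, an orr and a compare against zero.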
11290 unsigned NumXors = 0;
11291 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11292 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11293 isOrXorChain(LHS, NumXors, WorkList)) {
11294 SDValue XOR0, XOR1;
11295 std::tie(XOR0, XOR1) = WorkList[0];
11296 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11297 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11298 for (unsigned I = 1; I < WorkList.size(); I++) {
11299 std::tie(XOR0, XOR1) = WorkList[I];
11300 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11301 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11302 }
11303
11304     // Exit early by inverting the condition, which helps reduce indentation.
11305 return Cmp;
11306 }
11307
11308 return SDValue();
11309}
11310
11311SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11312
11313 if (Op.getValueType().isVector())
11314 return LowerVSETCC(Op, DAG);
11315
11316 bool IsStrict = Op->isStrictFPOpcode();
11317 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11318 unsigned OpNo = IsStrict ? 1 : 0;
11319 SDValue Chain;
11320 if (IsStrict)
11321 Chain = Op.getOperand(0);
11322 SDValue LHS = Op.getOperand(OpNo + 0);
11323 SDValue RHS = Op.getOperand(OpNo + 1);
11324 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11325 SDLoc DL(Op);
11326
11327 // We chose ZeroOrOneBooleanContents, so use zero and one.
11328 EVT VT = Op.getValueType();
11329 SDValue TVal = DAG.getConstant(1, DL, VT);
11330 SDValue FVal = DAG.getConstant(0, DL, VT);
11331
11332 // Handle f128 first, since one possible outcome is a normal integer
11333 // comparison which gets picked up by the next if statement.
11334 if (LHS.getValueType() == MVT::f128) {
11335 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11336 IsSignaling);
11337
11338 // If softenSetCCOperands returned a scalar, use it.
11339 if (!RHS.getNode()) {
11340 assert(LHS.getValueType() == Op.getValueType() &&
11341 "Unexpected setcc expansion!");
11342 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11343 }
11344 }
11345
11346 if (LHS.getValueType().isInteger()) {
11347
11348 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11349
11350 SDValue CCVal;
11352 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11353
11354 // Note that we inverted the condition above, so we reverse the order of
11355 // the true and false operands here. This will allow the setcc to be
11356 // matched to a single CSINC instruction.
11357 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11358 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11359 }
11360
11361 // Now we know we're dealing with FP values.
11362 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11363 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11364
11365 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11366 // and do the comparison.
11367 SDValue Cmp;
11368 if (IsStrict)
11369 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11370 else
11371 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11372
11373 AArch64CC::CondCode CC1, CC2;
11374 changeFPCCToAArch64CC(CC, CC1, CC2);
11375 SDValue Res;
11376 if (CC2 == AArch64CC::AL) {
11377 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11378 CC2);
11379 SDValue CC1Val = getCondCode(DAG, CC1);
11380
11381 // Note that we inverted the condition above, so we reverse the order of
11382 // the true and false operands here. This will allow the setcc to be
11383 // matched to a single CSINC instruction.
11384 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11385 } else {
11386 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11387     // totally clean. Some of them require two CSELs to implement. As in
11388 // this case, we emit the first CSEL and then emit a second using the output
11389 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11390
11391 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11392 SDValue CC1Val = getCondCode(DAG, CC1);
11393 SDValue CS1 =
11394 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11395
11396 SDValue CC2Val = getCondCode(DAG, CC2);
11397 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11398 }
11399 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11400}
11401
11402SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11403 SelectionDAG &DAG) const {
11404
11405 SDValue LHS = Op.getOperand(0);
11406 SDValue RHS = Op.getOperand(1);
11407 EVT VT = LHS.getValueType();
11408 if (VT != MVT::i32 && VT != MVT::i64)
11409 return SDValue();
11410
11411 SDLoc DL(Op);
11412 SDValue Carry = Op.getOperand(2);
11413 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11414 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11415 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11416 LHS, RHS, InvCarry);
11417
11418 EVT OpVT = Op.getValueType();
11419 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11420 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11421
11422 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11424 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11425 // Inputs are swapped because the condition is inverted. This will allow
11426 // matching with a single CSINC instruction.
11427 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11428 Cmp.getValue(1));
11429}
11430
11431/// Emit vector comparison for floating-point values, producing a mask.
11432 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11433                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
11434 const SDLoc &DL, SelectionDAG &DAG) {
11435 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11436 "function only supposed to emit natural comparisons");
11437
11438 switch (CC) {
11439 default:
11440 return SDValue();
11441 case AArch64CC::NE: {
11442 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11443 // Use vector semantics for the inversion to potentially save a copy between
11444 // SIMD and regular registers.
11445 if (!LHS.getValueType().isVector()) {
11446 EVT VecVT =
11447 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11448 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11449 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11450 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11451 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11452 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11453 }
11454 return DAG.getNOT(DL, Fcmeq, VT);
11455 }
11456 case AArch64CC::EQ:
11457 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11458 case AArch64CC::GE:
11459 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11460 case AArch64CC::GT:
11461 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11462 case AArch64CC::LE:
11463 if (!NoNans)
11464 return SDValue();
11465     // If we ignore NaNs then we can use the LS implementation.
11466 [[fallthrough]];
11467 case AArch64CC::LS:
11468 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11469 case AArch64CC::LT:
11470 if (!NoNans)
11471 return SDValue();
11472     // If we ignore NaNs then we can use the MI implementation.
11473 [[fallthrough]];
11474 case AArch64CC::MI:
11475 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11476 }
11477}
11478
11479/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11480/// values are scalars, try to emit a mask generating vector instruction.
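/// For example (illustrative): with f64 operands already in SIMD registers,
/// "select (setcc ogt a, b), all-ones, zero" can be emitted as a single scalar
/// "fcmgt d0, d1, d2", which produces the 0/all-ones mask directly and avoids
/// an fcmp + csetm + fmov round trip through the general-purpose registers.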
11481 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11482                                     SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11483 const SDLoc &DL, SelectionDAG &DAG) {
11484 assert(!LHS.getValueType().isVector());
11485 assert(!RHS.getValueType().isVector());
11486
11487 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11488 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11489 if (!CTVal || !CFVal)
11490 return {};
11491 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11492 !(CTVal->isZero() && CFVal->isAllOnes()))
11493 return {};
11494
11495 if (CTVal->isZero())
11496 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11497
11498 EVT VT = TVal.getValueType();
11499 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11500 return {};
11501
11502 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11503 bool OneNaN = false;
11504 if (LHS == RHS) {
11505 OneNaN = true;
11506 } else if (DAG.isKnownNeverNaN(RHS)) {
11507 OneNaN = true;
11508 RHS = LHS;
11509 } else if (DAG.isKnownNeverNaN(LHS)) {
11510 OneNaN = true;
11511 LHS = RHS;
11512 }
11513 if (OneNaN)
11514 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11515 }
11516
11519 bool ShouldInvert = false;
11520 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11521 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11522 SDValue Cmp2;
11523 if (CC2 != AArch64CC::AL) {
11524 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11525 if (!Cmp2)
11526 return {};
11527 }
11528 if (!Cmp2 && !ShouldInvert)
11529 return Cmp;
11530
11531 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11532 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11533 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11534 Zero);
11535 if (Cmp2) {
11536 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11537 Cmp2, Zero);
11538 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11539 }
11540 if (ShouldInvert)
11541 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11542 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11543 return Cmp;
11544}
11545
11546SDValue AArch64TargetLowering::LowerSELECT_CC(
11547 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
11549 const SDLoc &DL, SelectionDAG &DAG) const {
11550 // Handle f128 first, because it will result in a comparison of some RTLIB
11551 // call result against zero.
11552 if (LHS.getValueType() == MVT::f128) {
11553 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11554
11555 // If softenSetCCOperands returned a scalar, we need to compare the result
11556 // against zero to select between true and false values.
11557 if (!RHS.getNode()) {
11558 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11559 CC = ISD::SETNE;
11560 }
11561 }
11562
11563 // Also handle f16, for which we need to do a f32 comparison.
11564 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11565 LHS.getValueType() == MVT::bf16) {
11566 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11567 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11568 }
11569
11570 // Next, handle integers.
11571 if (LHS.getValueType().isInteger()) {
11572 assert((LHS.getValueType() == RHS.getValueType()) &&
11573 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11574
11575 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11576 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11577 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11578
11579 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11580 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11581 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11582 // Both require less instructions than compare and conditional select.
11583 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11584 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11585 LHS.getValueType() == RHS.getValueType()) {
11586 EVT VT = LHS.getValueType();
11587 SDValue Shift =
11588 DAG.getNode(ISD::SRA, DL, VT, LHS,
11589 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11590
11591 if (CC == ISD::SETGT)
11592 Shift = DAG.getNOT(DL, Shift, VT);
11593
11594 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11595 }
11596
11597 // Canonicalise absolute difference patterns:
11598 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11599 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11600 //
11601 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11602 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11603 // The second forms can be matched into subs+cneg.
11604 // NOTE: Drop poison generating flags from the negated operand to avoid
11605 // inadvertently propagating poison after the canonicalisation.
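      // For example (illustrative):
      //   select_cc a, b, (sub a, b), (sub b, a), setgt
      // is canonicalised with the false arm as neg(sub a, b), which can later
      // be matched as roughly "subs w8, w0, w1; cneg w0, w8, le".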
11606 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11607 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11608 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11610 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11611 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11612 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11614 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11615 }
11616 }
11617
11618 unsigned Opcode = AArch64ISD::CSEL;
11619
11620 // If both the TVal and the FVal are constants, see if we can swap them in
11621     // order to form a CSINV or CSINC out of them.
11622 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11623 std::swap(TVal, FVal);
11624 std::swap(CTVal, CFVal);
11625 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11626 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11627 std::swap(TVal, FVal);
11628 std::swap(CTVal, CFVal);
11629 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11630 } else if (TVal.getOpcode() == ISD::XOR) {
11631 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11632 // with a CSINV rather than a CSEL.
11633 if (isAllOnesConstant(TVal.getOperand(1))) {
11634 std::swap(TVal, FVal);
11635 std::swap(CTVal, CFVal);
11636 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11637 }
11638 } else if (TVal.getOpcode() == ISD::SUB) {
11639 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11640 // that we can match with a CSNEG rather than a CSEL.
11641 if (isNullConstant(TVal.getOperand(0))) {
11642 std::swap(TVal, FVal);
11643 std::swap(CTVal, CFVal);
11644 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11645 }
11646 } else if (CTVal && CFVal) {
11647 const int64_t TrueVal = CTVal->getSExtValue();
11648 const int64_t FalseVal = CFVal->getSExtValue();
11649 bool Swap = false;
11650
11651 // If both TVal and FVal are constants, see if FVal is the
11652 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11653 // instead of a CSEL in that case.
11654 if (TrueVal == ~FalseVal) {
11655 Opcode = AArch64ISD::CSINV;
11656 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11657 TrueVal == -FalseVal) {
11658 Opcode = AArch64ISD::CSNEG;
11659 } else if (TVal.getValueType() == MVT::i32) {
11660 // If our operands are only 32-bit wide, make sure we use 32-bit
11661 // arithmetic for the check whether we can use CSINC. This ensures that
11662 // the addition in the check will wrap around properly in case there is
11663 // an overflow (which would not be the case if we do the check with
11664 // 64-bit arithmetic).
11665 const uint32_t TrueVal32 = CTVal->getZExtValue();
11666 const uint32_t FalseVal32 = CFVal->getZExtValue();
11667
11668 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11669 Opcode = AArch64ISD::CSINC;
11670
11671 if (TrueVal32 > FalseVal32) {
11672 Swap = true;
11673 }
11674 }
11675 } else {
11676 // 64-bit check whether we can use CSINC.
11677 const uint64_t TrueVal64 = TrueVal;
11678 const uint64_t FalseVal64 = FalseVal;
11679
11680 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11681 Opcode = AArch64ISD::CSINC;
11682
11683 if (TrueVal > FalseVal) {
11684 Swap = true;
11685 }
11686 }
11687 }
11688
11689 // Swap TVal and FVal if necessary.
11690 if (Swap) {
11691 std::swap(TVal, FVal);
11692 std::swap(CTVal, CFVal);
11693 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11694 }
11695
11696 if (Opcode != AArch64ISD::CSEL) {
11697 // Drop FVal since we can get its value by simply inverting/negating
11698 // TVal.
11699 FVal = TVal;
11700 }
11701 }
11702
11703 // Avoid materializing a constant when possible by reusing a known value in
11704 // a register. However, don't perform this optimization if the known value
11705 // is one, zero or negative one in the case of a CSEL. We can always
11706 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11707 // FVal, respectively.
11708 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11709 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11710 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11712 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11713 // "a != C ? x : a" to avoid materializing C.
11714 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11715 TVal = LHS;
11716 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11717 FVal = LHS;
11718 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11719 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11720 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11721 // avoid materializing C.
11723 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11724 Opcode = AArch64ISD::CSINV;
11725 TVal = LHS;
11726 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11727 }
11728 }
11729
11730 SDValue CCVal;
11731 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11732 EVT VT = TVal.getValueType();
11733 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11734 }
11735
11736 // Now we know we're dealing with FP values.
11737 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11738 LHS.getValueType() == MVT::f64);
11739 assert(LHS.getValueType() == RHS.getValueType());
11740 EVT VT = TVal.getValueType();
11741
11742 // If the purpose of the comparison is to select between all ones
11743 // or all zeros, try to use a vector comparison because the operands are
11744 // already stored in SIMD registers.
11745 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11746 switch (U->getOpcode()) {
11747 default:
11748 return false;
11749 case ISD::INSERT_VECTOR_ELT:
11750 case ISD::SCALAR_TO_VECTOR:
11751 case AArch64ISD::DUP:
11752 return true;
11753 }
11754 })) {
11755 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11756 SDValue VectorCmp =
11757 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11758 if (VectorCmp)
11759 return VectorCmp;
11760 }
11761
11762 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11763
11764 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11765 // clean. Some of them require two CSELs to implement.
11766 AArch64CC::CondCode CC1, CC2;
11767 changeFPCCToAArch64CC(CC, CC1, CC2);
11768
11769 if (Flags.hasNoSignedZeros()) {
11770 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11771 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11772 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11773 if (RHSVal && RHSVal->isZero()) {
11774 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11775 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11776
11777 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11778 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11779 TVal = LHS;
11780 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11781 CFVal && CFVal->isZero() &&
11782 FVal.getValueType() == LHS.getValueType())
11783 FVal = LHS;
11784 }
11785 }
11786
11787 // Emit first, and possibly only, CSEL.
11788 SDValue CC1Val = getCondCode(DAG, CC1);
11789 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11790
11791 // If we need a second CSEL, emit it, using the output of the first as the
11792 // RHS. We're effectively OR'ing the two CC's together.
11793 if (CC2 != AArch64CC::AL) {
11794 SDValue CC2Val = getCondCode(DAG, CC2);
11795 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11796 }
11797
11798 // Otherwise, return the output of the first CSEL.
11799 return CS1;
11800}
11801
11802SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11803 SelectionDAG &DAG) const {
11804 EVT Ty = Op.getValueType();
11805 auto Idx = Op.getConstantOperandAPInt(2);
11806 int64_t IdxVal = Idx.getSExtValue();
11807 assert(Ty.isScalableVector() &&
11808 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11809
11810 // We can use the splice instruction for certain index values where we are
11811 // able to efficiently generate the correct predicate. The index will be
11812 // negated and used directly as the input to the ptrue instruction, i.e.
11813 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11814 // splice predicate. However, we can only do this if we can guarantee that
11815 // there are enough elements in the vector, hence we check that the magnitude
11816 // of the index does not exceed the minimum number of elements.
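  // A rough example (the exact predicate register is illustrative): for
  // "vector_splice nxv4i32 %a, %b, -2" we emit "ptrue p0.s, vl2", reverse it
  // so that only the last two lanes are active, and SPLICE then produces the
  // last two elements of %a followed by the leading elements of %b.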
11817 std::optional<unsigned> PredPattern;
11818 if (Ty.isScalableVector() && IdxVal < 0 &&
11819 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11820 std::nullopt) {
11821 SDLoc DL(Op);
11822
11823 // Create a predicate where all but the last -IdxVal elements are false.
11824 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11825 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11826 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11827
11828 // Now splice the two inputs together using the predicate.
11829 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11830 Op.getOperand(1));
11831 }
11832
11833 // We can select to an EXT instruction when indexing the first 256 bytes.
11835 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11836 return Op;
11837
11838 return SDValue();
11839}
11840
11841SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11842 SelectionDAG &DAG) const {
11843 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11844 SDValue LHS = Op.getOperand(0);
11845 SDValue RHS = Op.getOperand(1);
11846 SDValue TVal = Op.getOperand(2);
11847 SDValue FVal = Op.getOperand(3);
11848 SDNodeFlags Flags = Op->getFlags();
11849 SDLoc DL(Op);
11850 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
11851}
11852
11853SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11854 SelectionDAG &DAG) const {
11855 SDValue CCVal = Op->getOperand(0);
11856 SDValue TVal = Op->getOperand(1);
11857 SDValue FVal = Op->getOperand(2);
11858 SDLoc DL(Op);
11859
11860 EVT Ty = Op.getValueType();
11861 if (Ty == MVT::aarch64svcount) {
11862 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
11863 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
11864 SDValue Sel =
11865 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
11866 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
11867 }
11868
11869 if (Ty.isScalableVector()) {
11870 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
11871 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
11872 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11873 }
11874
11875 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
11876 // FIXME: Ideally this would be the same as above using i1 types, however
11877 // for the moment we can't deal with fixed i1 vector types properly, so
11878 // instead extend the predicate to a result type sized integer vector.
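    // For example (illustrative): a select between two v8i16 values under an
    // i1 condition sign-extends the condition to i16, splats it to a v8i16
    // mask, and emits a VSELECT on that mask.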
11879 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
11880 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
11881 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
11882 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
11883 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11884 }
11885
11886 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11887 // instruction.
11888 if (ISD::isOverflowIntrOpRes(CCVal)) {
11889 // Only lower legal XALUO ops.
11890 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
11891 return SDValue();
11892
11894 SDValue Value, Overflow;
11895 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
11896 SDValue CCVal = getCondCode(DAG, OFCC);
11897
11898 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
11899 CCVal, Overflow);
11900 }
11901
11902 // Lower it the same way as we would lower a SELECT_CC node.
11903 ISD::CondCode CC;
11904 SDValue LHS, RHS;
11905 if (CCVal.getOpcode() == ISD::SETCC) {
11906 LHS = CCVal.getOperand(0);
11907 RHS = CCVal.getOperand(1);
11908 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
11909 } else {
11910 LHS = CCVal;
11911 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
11912 CC = ISD::SETNE;
11913 }
11914
11915 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
11916 // order to use FCSELSrrr
11917 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11918 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11919 DAG.getUNDEF(MVT::f32), TVal);
11920 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11921 DAG.getUNDEF(MVT::f32), FVal);
11922 }
11923
11924 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
11925 Op->getFlags(), DL, DAG);
11926
11927 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11928 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
11929 }
11930
11931 return Res;
11932}
11933
11934SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11935 SelectionDAG &DAG) const {
11936 // Jump table entries are PC-relative offsets. No additional tweaking
11937 // is necessary here. Just get the address of the jump table.
11938 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11939
11942 !Subtarget->isTargetMachO())
11943 return getAddrLarge(JT, DAG);
11944 if (CM == CodeModel::Tiny)
11945 return getAddrTiny(JT, DAG);
11946 return getAddr(JT, DAG);
11947}
11948
11949SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11950 SelectionDAG &DAG) const {
11951 // Jump table entries are PC-relative offsets. No additional tweaking
11952 // is necessary here. Just get the address of the jump table.
11953 SDLoc DL(Op);
11954 SDValue JT = Op.getOperand(1);
11955 SDValue Entry = Op.getOperand(2);
11956 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
11957
11958 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11959 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
11960
11961 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11962 // sequence later, to guarantee the integrity of the intermediate values.
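  // For example (illustrative), a function carrying the string attribute
  //   "aarch64-jump-table-hardening"
  // takes this path and emits the BR_JumpTable pseudo below instead of the
  // JumpTableDest32 + BRIND sequence used otherwise.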
11964 "aarch64-jump-table-hardening")) {
11966 if (Subtarget->isTargetMachO()) {
11967 if (CM != CodeModel::Small && CM != CodeModel::Large)
11968 report_fatal_error("Unsupported code-model for hardened jump-table");
11969 } else {
11970 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11971 assert(Subtarget->isTargetELF() &&
11972 "jump table hardening only supported on MachO/ELF");
11973 if (CM != CodeModel::Small)
11974 report_fatal_error("Unsupported code-model for hardened jump-table");
11975 }
11976
11977 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
11978 Entry, SDValue());
11979 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
11980 DAG.getTargetJumpTable(JTI, MVT::i32),
11981 X16Copy.getValue(0), X16Copy.getValue(1));
11982 return SDValue(B, 0);
11983 }
11984
11985 SDNode *Dest =
11986 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
11987 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
11988 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
11989 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
11990}
11991
11992SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11993 SDValue Chain = Op.getOperand(0);
11994 SDValue Dest = Op.getOperand(1);
11995
11996 // BR_JT is lowered to BRIND, but the lowering below is specific to indirectbr.
11997 // Skip over the jump-table BRINDs, whose destination is JumpTableDest32.
11998 if (Dest->isMachineOpcode() &&
11999 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12000 return SDValue();
12001
12002 const MachineFunction &MF = DAG.getMachineFunction();
12003 std::optional<uint16_t> BADisc =
12005 if (!BADisc)
12006 return SDValue();
12007
12008 SDLoc DL(Op);
12009
12010 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12012 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12013
12014 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12015 {Dest, Key, Disc, AddrDisc, Chain});
12016 return SDValue(BrA, 0);
12017}
12018
12019SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12020 SelectionDAG &DAG) const {
12021 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12023 if (CM == CodeModel::Large) {
12024 // Use the GOT for the large code model on iOS.
12025 if (Subtarget->isTargetMachO()) {
12026 return getGOT(CP, DAG);
12027 }
12029 return getAddrLarge(CP, DAG);
12030 } else if (CM == CodeModel::Tiny) {
12031 return getAddrTiny(CP, DAG);
12032 }
12033 return getAddr(CP, DAG);
12034}
12035
12036SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12037 SelectionDAG &DAG) const {
12038 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12039 const BlockAddress *BA = BAN->getBlockAddress();
12040
12041 if (std::optional<uint16_t> BADisc =
12043 *BA->getFunction())) {
12044 SDLoc DL(Op);
12045
12046 // This isn't cheap, but BRIND is rare.
12047 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12048
12049 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12050
12052 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12053
12054 SDNode *MOV =
12055 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12056 {TargetBA, Key, AddrDisc, Disc});
12057 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12058 SDValue(MOV, 1));
12059 }
12060
12062 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12064 return getAddrLarge(BAN, DAG);
12065 } else if (CM == CodeModel::Tiny) {
12066 return getAddrTiny(BAN, DAG);
12067 }
12068 return getAddr(BAN, DAG);
12069}
12070
12071SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12072 SelectionDAG &DAG) const {
12073 AArch64FunctionInfo *FuncInfo =
12075
12076 SDLoc DL(Op);
12077 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12079 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12080 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12081 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12082 MachinePointerInfo(SV));
12083}
12084
12085SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12086 SelectionDAG &DAG) const {
12089
12090 SDLoc DL(Op);
12091 SDValue FR;
12092 if (Subtarget->isWindowsArm64EC()) {
12093 // With the Arm64EC ABI, we compute the address of the varargs save area
12094 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12095 // but calls from an entry thunk can pass in a different address.
12096 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12097 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12099 if (FuncInfo->getVarArgsGPRSize() > 0)
12100 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12101 else
12102 StackOffset = FuncInfo->getVarArgsStackOffset();
12103 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12104 DAG.getConstant(StackOffset, DL, MVT::i64));
12105 } else {
12106 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12107 ? FuncInfo->getVarArgsGPRIndex()
12108 : FuncInfo->getVarArgsStackIndex(),
12110 }
12111 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12112 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12113 MachinePointerInfo(SV));
12114}
12115
12116SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12117 SelectionDAG &DAG) const {
12118 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12119 // Standard, section B.3.
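  // For reference, a sketch of the va_list record being populated below
  // (LP64 offsets; ILP32 uses 4-byte pointers so the later offsets shrink):
  //   struct va_list {
  //     void *__stack;   // offset 0:  next stacked argument
  //     void *__gr_top;  // offset 8:  end of the GP register save area
  //     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
  //     int   __gr_offs; // offset 24: negative offset from __gr_top
  //     int   __vr_offs; // offset 28: negative offset from __vr_top
  //   };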
12122 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12123 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12124 auto PtrVT = getPointerTy(DAG.getDataLayout());
12125 SDLoc DL(Op);
12126
12127 SDValue Chain = Op.getOperand(0);
12128 SDValue VAList = Op.getOperand(1);
12129 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12131
12132 // void *__stack at offset 0
12133 unsigned Offset = 0;
12134 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12135 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12136 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12137 MachinePointerInfo(SV), Align(PtrSize)));
12138
12139 // void *__gr_top at offset 8 (4 on ILP32)
12140 Offset += PtrSize;
12141 int GPRSize = FuncInfo->getVarArgsGPRSize();
12142 if (GPRSize > 0) {
12143 SDValue GRTop, GRTopAddr;
12144
12145 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12146 DAG.getConstant(Offset, DL, PtrVT));
12147
12148 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12149 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12150 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12151 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12152
12153 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12155 Align(PtrSize)));
12156 }
12157
12158 // void *__vr_top at offset 16 (8 on ILP32)
12159 Offset += PtrSize;
12160 int FPRSize = FuncInfo->getVarArgsFPRSize();
12161 if (FPRSize > 0) {
12162 SDValue VRTop, VRTopAddr;
12163 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12164 DAG.getConstant(Offset, DL, PtrVT));
12165
12166 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12167 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12168 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12169 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12170
12171 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12173 Align(PtrSize)));
12174 }
12175
12176 // int __gr_offs at offset 24 (12 on ILP32)
12177 Offset += PtrSize;
12178 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12179 DAG.getConstant(Offset, DL, PtrVT));
12180 MemOps.push_back(
12181 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12182 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12183
12184 // int __vr_offs at offset 28 (16 on ILP32)
12185 Offset += 4;
12186 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12187 DAG.getConstant(Offset, DL, PtrVT));
12188 MemOps.push_back(
12189 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12190 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12191
12192 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12193}
12194
12195SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12196 SelectionDAG &DAG) const {
12198 Function &F = MF.getFunction();
12199
12200 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12201 return LowerWin64_VASTART(Op, DAG);
12202 else if (Subtarget->isTargetDarwin())
12203 return LowerDarwin_VASTART(Op, DAG);
12204 else
12205 return LowerAAPCS_VASTART(Op, DAG);
12206}
12207
12208SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12209 SelectionDAG &DAG) const {
12210 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
12211 // pointer.
12212 SDLoc DL(Op);
12213 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12214 unsigned VaListSize =
12215 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12216 ? PtrSize
12217 : Subtarget->isTargetILP32() ? 20 : 32;
12218 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12219 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12220
12221 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12222 DAG.getConstant(VaListSize, DL, MVT::i32),
12223 Align(PtrSize), false, false, /*CI=*/nullptr,
12224 std::nullopt, MachinePointerInfo(DestSV),
12225 MachinePointerInfo(SrcSV));
12226}
12227
12228SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12229 assert(Subtarget->isTargetDarwin() &&
12230 "automatic va_arg instruction only works on Darwin");
12231
12232 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12233 EVT VT = Op.getValueType();
12234 SDLoc DL(Op);
12235 SDValue Chain = Op.getOperand(0);
12236 SDValue Addr = Op.getOperand(1);
12237 MaybeAlign Align(Op.getConstantOperandVal(3));
12238 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12239 auto PtrVT = getPointerTy(DAG.getDataLayout());
12240 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12241 SDValue VAList =
12242 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12243 Chain = VAList.getValue(1);
12244 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12245
12246 if (VT.isScalableVector())
12247 report_fatal_error("Passing SVE types to variadic functions is "
12248 "currently not supported");
12249
12250 if (Align && *Align > MinSlotSize) {
12251 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12252 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12253 VAList =
12254 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12255 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12256 }
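  // A small worked example (illustrative): for VAList == 0x1004 and a 16-byte
  // aligned argument, the two nodes above compute (0x1004 + 15) & -16 == 0x1010,
  // the next 16-byte-aligned slot.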
12257
12258 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12259 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12260
12261 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12262 // up to 64 bits. At the very least, we have to increase the striding of the
12263 // vaargs list to match this, and for FP values we need to introduce
12264 // FP_ROUND nodes as well.
12265 if (VT.isInteger() && !VT.isVector())
12266 ArgSize = std::max(ArgSize, MinSlotSize);
12267 bool NeedFPTrunc = false;
12268 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12269 ArgSize = 8;
12270 NeedFPTrunc = true;
12271 }
12272
12273 // Increment the pointer, VAList, to the next vaarg
12274 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12275 DAG.getConstant(ArgSize, DL, PtrVT));
12276 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12277
12278 // Store the incremented VAList to the legalized pointer
12279 SDValue APStore =
12280 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12281
12282 // Load the actual argument out of the pointer VAList
12283 if (NeedFPTrunc) {
12284 // Load the value as an f64.
12285 SDValue WideFP =
12286 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12287 // Round the value down to an f32.
12288 SDValue NarrowFP =
12289 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12290 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12291 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12292 // Merge the rounded value with the chain output of the load.
12293 return DAG.getMergeValues(Ops, DL);
12294 }
12295
12296 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12297}
12298
12299SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12300 SelectionDAG &DAG) const {
12302 MFI.setFrameAddressIsTaken(true);
12303
12304 EVT VT = Op.getValueType();
12305 SDLoc DL(Op);
12306 unsigned Depth = Op.getConstantOperandVal(0);
12307 SDValue FrameAddr =
12308 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12309 while (Depth--)
12310 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12312
12313 if (Subtarget->isTargetILP32())
12314 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12315 DAG.getValueType(VT));
12316
12317 return FrameAddr;
12318}
12319
12320SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12321 SelectionDAG &DAG) const {
12323
12324 EVT VT = getPointerTy(DAG.getDataLayout());
12325 int FI = MFI.CreateFixedObject(4, 0, false);
12326 return DAG.getFrameIndex(FI, VT);
12327}
12328
12329#define GET_REGISTER_MATCHER
12330#include "AArch64GenAsmMatcher.inc"
12331
12332// FIXME? Maybe this could be a TableGen attribute on some registers and
12333// this table could be generated automatically from RegInfo.
12334Register AArch64TargetLowering::
12335getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12337 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12338 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12339 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12340 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12341 !MRI->isReservedReg(MF, Reg))
12342 Reg = Register();
12343 }
12344 return Reg;
12345}
12346
12347SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12348 SelectionDAG &DAG) const {
12350
12351 EVT VT = Op.getValueType();
12352 SDLoc DL(Op);
12353
12354 SDValue FrameAddr =
12355 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12357
12358 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12359}
12360
12361SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12362 SelectionDAG &DAG) const {
12364 MachineFrameInfo &MFI = MF.getFrameInfo();
12365 MFI.setReturnAddressIsTaken(true);
12366
12367 EVT VT = Op.getValueType();
12368 SDLoc DL(Op);
12369 unsigned Depth = Op.getConstantOperandVal(0);
12370 SDValue ReturnAddress;
12371 if (Depth) {
12372 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12374 ReturnAddress = DAG.getLoad(
12375 VT, DL, DAG.getEntryNode(),
12376 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12377 } else {
12378 // Return LR, which contains the return address. Mark it an implicit
12379 // live-in.
12380 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12381 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12382 }
12383
12384 // The XPACLRI instruction assembles to a hint-space instruction before
12385 // Armv8.3-A, so it can be safely used on any pre-Armv8.3-A
12386 // architecture. On Armv8.3-A and onwards, XPACI is available, so use
12387 // that instead.
12388 SDNode *St;
12389 if (Subtarget->hasPAuth()) {
12390 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12391 } else {
12392 // XPACLRI operates on LR therefore we must move the operand accordingly.
12393 SDValue Chain =
12394 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12395 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12396 }
12397 return SDValue(St, 0);
12398}
12399
12400 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12401 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
12402SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12403 SelectionDAG &DAG) const {
12404 SDValue Lo, Hi;
12405 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12406 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12407}
12408
12410 const GlobalAddressSDNode *GA) const {
12411 // Offsets are folded in the DAG combine rather than here so that we can
12412 // intelligently choose an offset based on the uses.
12413 return false;
12414}
12415
12417 bool OptForSize) const {
12418 bool IsLegal = false;
12419 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12420 // and for the 16-bit case when the target has full fp16 support.
12421 // We encode bf16 bit patterns as if they were fp16. This results in very
12422 // strange looking assembly but should populate the register with appropriate
12423 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12424 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12425 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12426 // FIXME: We should be able to handle f128 as well with a clever lowering.
12427 const APInt ImmInt = Imm.bitcastToAPInt();
12428 if (VT == MVT::f64)
12429 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12430 else if (VT == MVT::f32)
12431 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12432 else if (VT == MVT::f16 || VT == MVT::bf16)
12433 IsLegal =
12434 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12435 Imm.isPosZero();
12436
12437 // If we cannot materialize the value in the immediate field of an fmov,
12438 // check if it can be encoded as the immediate operand of a logical
12439 // instruction. The immediate value will be created with MOVZ, MOVN, or ORR.
12440 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12441 // generate that fmov.
12442 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12443 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12444 // however the mov+fmov sequence is always better because of the reduced
12445 // cache pressure. The timings are still the same if you consider
12446 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12447 // movw+movk is fused). So we limit this to at most 2 instructions.
12450 assert(Insn.size() <= 4 &&
12451 "Should be able to build any value with at most 4 moves");
12452 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12453 IsLegal = Insn.size() <= Limit;
12454 }
12455
12456 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12457 << " imm value: "; Imm.dump(););
12458 return IsLegal;
12459}
12460
12461//===----------------------------------------------------------------------===//
12462// AArch64 Optimization Hooks
12463//===----------------------------------------------------------------------===//
12464
12465static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12466 SDValue Operand, SelectionDAG &DAG,
12467 int &ExtraSteps) {
12468 EVT VT = Operand.getValueType();
12469 if ((ST->hasNEON() &&
12470 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12471 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12472 VT == MVT::v4f32)) ||
12473 (ST->hasSVE() &&
12474 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12476 // For the reciprocal estimates, convergence is quadratic, so the number
12477 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12478 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12479 // the result for float (23 mantissa bits) is 2 and for double (52
12480 // mantissa bits) is 3.
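  // Worked through for the formula below (illustrative): f32 has 24 bits of
  // precision, so ExtraSteps = ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2;
  // f64 has 53 bits, giving 6 - 3 = 3, matching the counts above.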
12481 constexpr unsigned AccurateBits = 8;
12482 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12483 ExtraSteps = DesiredBits <= AccurateBits
12484 ? 0
12485 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12486 }
12487
12488 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12489 }
12490
12491 return SDValue();
12492}
12493
12494SDValue
12495AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12496 const DenormalMode &Mode) const {
12497 SDLoc DL(Op);
12498 EVT VT = Op.getValueType();
12499 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12500 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12501 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12502}
12503
12504SDValue
12505AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12506 SelectionDAG &DAG) const {
12507 return Op;
12508}
12509
12510SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12511 SelectionDAG &DAG, int Enabled,
12512 int &ExtraSteps,
12513 bool &UseOneConst,
12514 bool Reciprocal) const {
12516 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12517 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12518 DAG, ExtraSteps)) {
12519 SDLoc DL(Operand);
12520 EVT VT = Operand.getValueType();
12521
12522 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12525
12526 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12527 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
12528 for (int i = ExtraSteps; i > 0; --i) {
12529 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12530 Flags);
12531 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12532 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12533 }
12534 if (!Reciprocal)
12535 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12536
12537 ExtraSteps = 0;
12538 return Estimate;
12539 }
12540
12541 return SDValue();
12542}
12543
12544SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12545 SelectionDAG &DAG, int Enabled,
12546 int &ExtraSteps) const {
12548 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12549 DAG, ExtraSteps)) {
12550 SDLoc DL(Operand);
12551 EVT VT = Operand.getValueType();
12552
12554
12555 // Newton reciprocal iteration: E * (2 - X * E)
12556 // AArch64 reciprocal iteration instruction: (2 - M * N)
12557 for (int i = ExtraSteps; i > 0; --i) {
12558 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12559 Estimate, Flags);
12560 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12561 }
12562
12563 ExtraSteps = 0;
12564 return Estimate;
12565 }
12566
12567 return SDValue();
12568}
12569
12570//===----------------------------------------------------------------------===//
12571// AArch64 Inline Assembly Support
12572//===----------------------------------------------------------------------===//
12573
12574// Table of Constraints
12575 // TODO: This is the current set of constraints supported by ARM for the
12576 // compiler; not all of them may make sense.
12577//
12578// r - A general register
12579// w - An FP/SIMD register of some size in the range v0-v31
12580// x - An FP/SIMD register of some size in the range v0-v15
12581// I - Constant that can be used with an ADD instruction
12582// J - Constant that can be used with a SUB instruction
12583// K - Constant that can be used with a 32-bit logical instruction
12584// L - Constant that can be used with a 64-bit logical instruction
12585// M - Constant that can be used as a 32-bit MOV immediate
12586// N - Constant that can be used as a 64-bit MOV immediate
12587// Q - A memory reference with base register and no offset
12588// S - A symbolic address
12589// Y - Floating point constant zero
12590// Z - Integer constant zero
12591//
12592// Note that general register operands will be output using their 64-bit x
12593// register name, whatever the size of the variable, unless the asm operand
12594// is prefixed by the %w modifier. Floating-point and SIMD register operands
12595// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12596// %q modifier.
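// A couple of illustrative (not exhaustive) uses of these constraints from C:
//   asm("add %w0, %w1, %w2" : "=r"(res) : "r"(a), "r"(b)); // 'r' with %w modifier
//   asm("fadd %s0, %s1, %s2" : "=w"(f)   : "w"(x), "w"(y)); // 'w' FP/SIMD register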
12597const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12598 // At this point, we have to lower this constraint to something else, so we
12599 // lower it to an "r" or "w". However, by doing this we will force the result
12600 // to be in register, while the X constraint is much more permissive.
12601 //
12602 // Although we are correct (we are free to emit anything, without
12603 // constraints), we might break use cases that would expect us to be more
12604 // efficient and emit something else.
12605 if (!Subtarget->hasFPARMv8())
12606 return "r";
12607
12608 if (ConstraintVT.isFloatingPoint())
12609 return "w";
12610
12611 if (ConstraintVT.isVector() &&
12612 (ConstraintVT.getSizeInBits() == 64 ||
12613 ConstraintVT.getSizeInBits() == 128))
12614 return "w";
12615
12616 return "r";
12617}
12618
12620
12621// Returns a {Reg, RegisterClass} tuple if the constraint is
12622// a specific predicate register.
12623//
12624 // For a constraint like "{pn3}", the default path in
12625// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12626// suitable register class for this register is "PPRorPNR", after which it
12627// determines that nxv16i1 is an appropriate type for the constraint, which is
12628// not what we want. The code here pre-empts this by matching the register
12629// explicitly.
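// For example (illustrative), an operand or clobber written as "{pn8}"
// resolves here to PN8 in the PNR class, "{p3}" to P3 in PPR, and "{z20}" to
// Z20 in ZPR, bypassing the generic constraint matching.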
12630static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12632 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12633 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12634 return std::nullopt;
12635
12636 bool IsPredicate = Constraint[1] == 'p';
12637 Constraint = Constraint.substr(2, Constraint.size() - 3);
12638 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12639 if (IsPredicateAsCount)
12640 Constraint = Constraint.drop_front(1);
12641
12642 unsigned V;
12643 if (Constraint.getAsInteger(10, V) || V > 31)
12644 return std::nullopt;
12645
12646 if (IsPredicateAsCount)
12647 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12648 if (IsPredicate)
12649 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12650 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12651}
12652
12653static std::optional<PredicateConstraint>
12656 .Case("Uph", PredicateConstraint::Uph)
12657 .Case("Upl", PredicateConstraint::Upl)
12658 .Case("Upa", PredicateConstraint::Upa)
12659 .Default(std::nullopt);
12660}
12661
12662static const TargetRegisterClass *
12664 if (VT != MVT::aarch64svcount &&
12665 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12666 return nullptr;
12667
12668 switch (Constraint) {
12669 case PredicateConstraint::Uph:
12670 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12671 : &AArch64::PPR_p8to15RegClass;
12672 case PredicateConstraint::Upl:
12673 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12674 : &AArch64::PPR_3bRegClass;
12675 case PredicateConstraint::Upa:
12676 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12677 : &AArch64::PPRRegClass;
12678 }
12679
12680 llvm_unreachable("Missing PredicateConstraint!");
12681}
12682
12684
12685static std::optional<ReducedGprConstraint>
12688 .Case("Uci", ReducedGprConstraint::Uci)
12689 .Case("Ucj", ReducedGprConstraint::Ucj)
12690 .Default(std::nullopt);
12691}
12692
12693static const TargetRegisterClass *
12695 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12696 return nullptr;
12697
12698 switch (Constraint) {
12699 case ReducedGprConstraint::Uci:
12700 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12701 case ReducedGprConstraint::Ucj:
12702 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12703 }
12704
12705 llvm_unreachable("Missing ReducedGprConstraint!");
12706}
12707
12708 // The set of condition codes supported here is taken from
12709// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12712 .Case("{@cchi}", AArch64CC::HI)
12713 .Case("{@cccs}", AArch64CC::HS)
12714 .Case("{@cclo}", AArch64CC::LO)
12715 .Case("{@ccls}", AArch64CC::LS)
12716 .Case("{@cccc}", AArch64CC::LO)
12717 .Case("{@cceq}", AArch64CC::EQ)
12718 .Case("{@ccgt}", AArch64CC::GT)
12719 .Case("{@ccge}", AArch64CC::GE)
12720 .Case("{@cclt}", AArch64CC::LT)
12721 .Case("{@ccle}", AArch64CC::LE)
12722 .Case("{@cchs}", AArch64CC::HS)
12723 .Case("{@ccne}", AArch64CC::NE)
12724 .Case("{@ccvc}", AArch64CC::VC)
12725 .Case("{@ccpl}", AArch64CC::PL)
12726 .Case("{@ccvs}", AArch64CC::VS)
12727 .Case("{@ccmi}", AArch64CC::MI)
12729 return Cond;
12730}
12731
12732/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12733/// WZR, invert(<cond>)'.
12735 SelectionDAG &DAG) {
12736 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12737 DAG.getConstant(0, DL, MVT::i32),
12738 DAG.getConstant(0, DL, MVT::i32),
12739 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12740}
12741
12742// Lower @cc flag output via getSETCC.
12743SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12744 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12745 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12746 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12747 if (Cond == AArch64CC::Invalid)
12748 return SDValue();
12749 // The output variable should be a scalar integer.
12750 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12751 OpInfo.ConstraintVT.getSizeInBits() < 8)
12752 report_fatal_error("Flag output operand is of invalid type");
12753
12754 // Get NZCV register. Only update chain when copyfrom is glued.
12755 if (Glue.getNode()) {
12756 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12757 Chain = Glue.getValue(1);
12758 } else
12759 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12760 // Extract CC code.
12761 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12762
12764
12765 // Truncate or ZERO_EXTEND based on value types.
12766 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12767 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12768 else
12769 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12770
12771 return Result;
12772}
12773
12774/// getConstraintType - Given a constraint letter, return the type of
12775/// constraint it is for this target.
12777AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12778 if (Constraint.size() == 1) {
12779 switch (Constraint[0]) {
12780 default:
12781 break;
12782 case 'x':
12783 case 'w':
12784 case 'y':
12785 return C_RegisterClass;
12786 // An address with a single base register. Due to the way we
12787 // currently handle addresses it is the same as 'r'.
12788 case 'Q':
12789 return C_Memory;
12790 case 'I':
12791 case 'J':
12792 case 'K':
12793 case 'L':
12794 case 'M':
12795 case 'N':
12796 case 'Y':
12797 case 'Z':
12798 return C_Immediate;
12799 case 'z':
12800 case 'S': // A symbol or label reference with a constant offset
12801 return C_Other;
12802 }
12803 } else if (parsePredicateConstraint(Constraint))
12804 return C_RegisterClass;
12805 else if (parseReducedGprConstraint(Constraint))
12806 return C_RegisterClass;
12807 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12808 return C_Other;
12809 return TargetLowering::getConstraintType(Constraint);
12810}
12811
12812/// Examine constraint type and operand type and determine a weight value.
12813/// This object must already have been set up with the operand type
12814/// and the current alternative constraint selected.
12816AArch64TargetLowering::getSingleConstraintMatchWeight(
12817 AsmOperandInfo &info, const char *constraint) const {
12819 Value *CallOperandVal = info.CallOperandVal;
12820 // If we don't have a value, we can't do a match,
12821 // but allow it at the lowest weight.
12822 if (!CallOperandVal)
12823 return CW_Default;
12824 Type *type = CallOperandVal->getType();
12825 // Look at the constraint type.
12826 switch (*constraint) {
12827 default:
12829 break;
12830 case 'x':
12831 case 'w':
12832 case 'y':
12833 if (type->isFloatingPointTy() || type->isVectorTy())
12834 weight = CW_Register;
12835 break;
12836 case 'z':
12837 weight = CW_Constant;
12838 break;
12839 case 'U':
12840 if (parsePredicateConstraint(constraint) ||
12841 parseReducedGprConstraint(constraint))
12842 weight = CW_Register;
12843 break;
12844 }
12845 return weight;
12846}
12847
12848std::pair<unsigned, const TargetRegisterClass *>
12849AArch64TargetLowering::getRegForInlineAsmConstraint(
12850 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12851 if (Constraint.size() == 1) {
12852 switch (Constraint[0]) {
12853 case 'r':
12854 if (VT.isScalableVector())
12855 return std::make_pair(0U, nullptr);
12856 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12857 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
12858 if (VT.getFixedSizeInBits() == 64)
12859 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
12860 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
12861 case 'w': {
12862 if (!Subtarget->hasFPARMv8())
12863 break;
12864 if (VT.isScalableVector()) {
12865 if (VT.getVectorElementType() != MVT::i1)
12866 return std::make_pair(0U, &AArch64::ZPRRegClass);
12867 return std::make_pair(0U, nullptr);
12868 }
12869 if (VT == MVT::Other)
12870 break;
12871 uint64_t VTSize = VT.getFixedSizeInBits();
12872 if (VTSize == 16)
12873 return std::make_pair(0U, &AArch64::FPR16RegClass);
12874 if (VTSize == 32)
12875 return std::make_pair(0U, &AArch64::FPR32RegClass);
12876 if (VTSize == 64)
12877 return std::make_pair(0U, &AArch64::FPR64RegClass);
12878 if (VTSize == 128)
12879 return std::make_pair(0U, &AArch64::FPR128RegClass);
12880 break;
12881 }
12882 // The instructions that this constraint is designed for can
12883 // only take 128-bit registers so just use that regclass.
12884 case 'x':
12885 if (!Subtarget->hasFPARMv8())
12886 break;
12887 if (VT.isScalableVector())
12888 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
12889 if (VT.getSizeInBits() == 128)
12890 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
12891 break;
12892 case 'y':
12893 if (!Subtarget->hasFPARMv8())
12894 break;
12895 if (VT.isScalableVector())
12896 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
12897 break;
12898 }
12899 } else {
12900 if (const auto P = parseSVERegAsConstraint(Constraint)) {
12901 // SME functions that are not in streaming mode should
12902 // still observe clobbers of Z-registers by clobbering
12903 // the lower 128 bits of those registers.
12904 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
12905 !Subtarget->isSVEorStreamingSVEAvailable())
12906 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
12907 &AArch64::FPR128RegClass);
12908 return *P;
12909 }
12910 if (const auto PC = parsePredicateConstraint(Constraint))
12911 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
12912 return std::make_pair(0U, RegClass);
12913
12914 if (const auto RGC = parseReducedGprConstraint(Constraint))
12915 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
12916 return std::make_pair(0U, RegClass);
12917 }
12918 if (StringRef("{cc}").equals_insensitive(Constraint) ||
12920 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
12921
12922 if (Constraint == "{za}") {
12923 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
12924 }
12925
12926 if (Constraint == "{zt0}") {
12927 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
12928 }
12929
12930 // Use the default implementation in TargetLowering to convert the register
12931 // constraint into a member of a register class.
12932 std::pair<unsigned, const TargetRegisterClass *> Res;
12934
12935 // Not found as a standard register?
12936 if (!Res.second) {
12937 unsigned Size = Constraint.size();
12938 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12939 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12940 int RegNo;
12941 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
12942 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12943 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12944 // By default we'll emit v0-v31 for this unless there's a modifier where
12945 // we'll emit the correct register as well.
12946 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12947 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
12948 Res.second = &AArch64::FPR64RegClass;
12949 } else {
12950 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
12951 Res.second = &AArch64::FPR128RegClass;
12952 }
12953 }
12954 }
12955 }
12956
12957 if (Res.second && !Subtarget->hasFPARMv8() &&
12958 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
12959 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
12960 return std::make_pair(0U, nullptr);
12961
12962 return Res;
12963}
12964
12966 llvm::Type *Ty,
12967 bool AllowUnknown) const {
12968 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
12969 return EVT(MVT::i64x8);
12970
12971 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12972}
12973
12974/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12975/// vector. If it is invalid, don't add anything to Ops.
12976void AArch64TargetLowering::LowerAsmOperandForConstraint(
12977 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12978 SelectionDAG &DAG) const {
12979 SDValue Result;
12980
12981 // Currently only support length 1 constraints.
12982 if (Constraint.size() != 1)
12983 return;
12984
12985 char ConstraintLetter = Constraint[0];
12986 switch (ConstraintLetter) {
12987 default:
12988 break;
12989
12990 // This set of constraints deals with valid constants for various instructions.
12991 // Validate and return a target constant for them if we can.
12992 case 'z': {
12993 // 'z' maps to xzr or wzr so it needs an input of 0.
12994 if (!isNullConstant(Op))
12995 return;
12996
12997 if (Op.getValueType() == MVT::i64)
12998 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
12999 else
13000 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13001 break;
13002 }
13003 case 'S':
13004 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13005 // supported for PIC while "s" isn't, making "s" less useful. We implement
13006 // "S" but not "s".
13008 break;
13009
13010 case 'I':
13011 case 'J':
13012 case 'K':
13013 case 'L':
13014 case 'M':
13015 case 'N':
13016 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13017 if (!C)
13018 return;
13019
13020 // Grab the value and do some validation.
13021 uint64_t CVal = C->getZExtValue();
13022 switch (ConstraintLetter) {
13023 // The I constraint applies only to simple ADD or SUB immediate operands:
13024 // i.e. 0 to 4095 with optional shift by 12
13025 // The J constraint applies only to ADD or SUB immediates that would be
13026 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13027 // instruction [or vice versa], in other words -1 to -4095 with optional
13028 // left shift by 12.
13029 case 'I':
13030 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13031 break;
13032 return;
13033 case 'J': {
13034 uint64_t NVal = -C->getSExtValue();
13035 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13036 CVal = C->getSExtValue();
13037 break;
13038 }
13039 return;
13040 }
13041 // The K and L constraints apply *only* to logical immediates, including
13042 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13043 // been removed and MOV should be used). So these constraints have to
13044 // distinguish between bit patterns that are valid 32-bit or 64-bit
13045 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13046 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13047 // versa.
13048 case 'K':
13049 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13050 break;
13051 return;
13052 case 'L':
13053 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13054 break;
13055 return;
13056 // The M and N constraints are a superset of K and L respectively, for use
13057 // with the MOV (immediate) alias. As well as the logical immediates they
13058 // also match 32 or 64-bit immediates that can be loaded either using a
13059 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13060 // (M) or 64-bit 0x1234000000000000 (N), etc.
13061 // As a note, some of this code is liberally stolen from the asm parser.
13062 case 'M': {
13063 if (!isUInt<32>(CVal))
13064 return;
13065 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13066 break;
13067 if ((CVal & 0xFFFF) == CVal)
13068 break;
13069 if ((CVal & 0xFFFF0000ULL) == CVal)
13070 break;
13071 uint64_t NCVal = ~(uint32_t)CVal;
13072 if ((NCVal & 0xFFFFULL) == NCVal)
13073 break;
13074 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13075 break;
13076 return;
13077 }
13078 case 'N': {
13079 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13080 break;
13081 if ((CVal & 0xFFFFULL) == CVal)
13082 break;
13083 if ((CVal & 0xFFFF0000ULL) == CVal)
13084 break;
13085 if ((CVal & 0xFFFF00000000ULL) == CVal)
13086 break;
13087 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13088 break;
13089 uint64_t NCVal = ~CVal;
13090 if ((NCVal & 0xFFFFULL) == NCVal)
13091 break;
13092 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13093 break;
13094 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13095 break;
13096 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13097 break;
13098 return;
13099 }
13100 default:
13101 return;
13102 }
13103
13104 // All assembler immediates are 64-bit integers.
13105 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13106 break;
13107 }
13108
13109 if (Result.getNode()) {
13110 Ops.push_back(Result);
13111 return;
13112 }
13113
13114 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13115}
13116
13117//===----------------------------------------------------------------------===//
13118// AArch64 Advanced SIMD Support
13119//===----------------------------------------------------------------------===//
13120
13121/// WidenVector - Given a value in the V64 register class, produce the
13122/// equivalent value in the V128 register class.
13124 EVT VT = V64Reg.getValueType();
13125 unsigned NarrowSize = VT.getVectorNumElements();
13126 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13127 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13128 SDLoc DL(V64Reg);
13129
13130 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13131 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13132}
13133
13134/// getExtFactor - Determine the adjustment factor for the position when
13135/// generating an "extract from vector registers" instruction.
13136static unsigned getExtFactor(SDValue &V) {
13137 EVT EltType = V.getValueType().getVectorElementType();
13138 return EltType.getSizeInBits() / 8;
13139}
13140
13141 // Check if a vector is built by extracting elements of one vector at indices
13142 // taken from another vector, optionally masked by a constant AND that keeps
13143 // every index in range. This can be reconstructed using AND and NEON's TBL1.
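// For example (illustrative), a v16i8 BUILD_VECTOR whose lane i is
//   extractelt %src, (and (extractelt %indices, i), 15)
// for every i can be emitted as: TBL1 %src, (AND %indices, splat(15)).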
13145 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13146 SDLoc DL(Op);
13147 EVT VT = Op.getValueType();
13148 assert(!VT.isScalableVector() &&
13149 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13150
13151 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13152 // directly to TBL1.
13153 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13154 return SDValue();
13155
13156 unsigned NumElts = VT.getVectorNumElements();
13157 assert((NumElts == 8 || NumElts == 16) &&
13158 "Need to have exactly 8 or 16 elements in vector.");
13159
13160 SDValue SourceVec;
13161 SDValue MaskSourceVec;
13162 SmallVector<SDValue, 16> AndMaskConstants;
13163
13164 for (unsigned i = 0; i < NumElts; ++i) {
13165 SDValue V = Op.getOperand(i);
13166 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13167 return SDValue();
13168
13169 SDValue OperandSourceVec = V.getOperand(0);
13170 if (!SourceVec)
13171 SourceVec = OperandSourceVec;
13172 else if (SourceVec != OperandSourceVec)
13173 return SDValue();
13174
13175 // This only looks at shuffles with elements that are
13176 // a) truncated by a constant AND mask extracted from a mask vector, or
13177 // b) extracted directly from a mask vector.
13178 SDValue MaskSource = V.getOperand(1);
13179 if (MaskSource.getOpcode() == ISD::AND) {
13180 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13181 return SDValue();
13182
13183 AndMaskConstants.push_back(MaskSource.getOperand(1));
13184 MaskSource = MaskSource->getOperand(0);
13185 } else if (!AndMaskConstants.empty()) {
13186 // Either all or no operands should have an AND mask.
13187 return SDValue();
13188 }
13189
13190 // An ANY_EXTEND may be inserted between the AND and the source vector
13191 // extraction. We don't care about that, so we can just skip it.
13192 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13193 MaskSource = MaskSource.getOperand(0);
13194
13195 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13196 return SDValue();
13197
13198 SDValue MaskIdx = MaskSource.getOperand(1);
13199 if (!isa<ConstantSDNode>(MaskIdx) ||
13200 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13201 return SDValue();
13202
13203 // We only apply this if all elements come from the same vector with the
13204 // same vector type.
13205 if (!MaskSourceVec) {
13206 MaskSourceVec = MaskSource->getOperand(0);
13207 if (MaskSourceVec.getValueType() != VT)
13208 return SDValue();
13209 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13210 return SDValue();
13211 }
13212 }
13213
13214 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13215 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13216 // insert, we know that the index in the mask must be smaller than the number
13217 // of elements in the source, or we would have an out-of-bounds access.
13218 if (NumElts == 8)
13219 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13220 DAG.getUNDEF(VT));
13221
13222 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13223 if (!AndMaskConstants.empty())
13224 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13225 DAG.getBuildVector(VT, DL, AndMaskConstants));
13226
13227 return DAG.getNode(
13229 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec,
13230 MaskSourceVec);
13231}
13232
13233// Gather data to see if the operation can be modelled as a
13234// shuffle in combination with VEXTs.
13236 SelectionDAG &DAG) const {
13237 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13238 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13239 SDLoc DL(Op);
13240 EVT VT = Op.getValueType();
13241 assert(!VT.isScalableVector() &&
13242 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13243 unsigned NumElts = VT.getVectorNumElements();
13244
13245 struct ShuffleSourceInfo {
13246 SDValue Vec;
13247 unsigned MinElt;
13248 unsigned MaxElt;
13249
13250 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13251 // be compatible with the shuffle we intend to construct. As a result
13252 // ShuffleVec will be some sliding window into the original Vec.
13253 SDValue ShuffleVec;
13254
13255 // Code should guarantee that element i in Vec starts at element
13256 // "WindowBase + i * WindowScale" in ShuffleVec.
13257 int WindowBase;
13258 int WindowScale;
13259
13260 ShuffleSourceInfo(SDValue Vec)
13261 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13262 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13263
13264 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13265 };
13266
13267 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13268 // node.
13269 SmallVector<ShuffleSourceInfo, 2> Sources;
13270 for (unsigned i = 0; i < NumElts; ++i) {
13271 SDValue V = Op.getOperand(i);
13272 if (V.isUndef())
13273 continue;
13274 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13275 !isa<ConstantSDNode>(V.getOperand(1)) ||
13276 V.getOperand(0).getValueType().isScalableVector()) {
13277 LLVM_DEBUG(
13278 dbgs() << "Reshuffle failed: "
13279 "a shuffle can only come from building a vector from "
13280 "various elements of other fixed-width vectors, provided "
13281 "their indices are constant\n");
13282 return SDValue();
13283 }
13284
13285 // Add this element source to the list if it's not already there.
13286 SDValue SourceVec = V.getOperand(0);
13287 auto Source = find(Sources, SourceVec);
13288 if (Source == Sources.end())
13289 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13290
13291 // Update the minimum and maximum lane number seen.
13292 unsigned EltNo = V.getConstantOperandVal(1);
13293 Source->MinElt = std::min(Source->MinElt, EltNo);
13294 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13295 }
13296
13297 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13298 // better than moving to/from gpr registers for larger vectors.
13299 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13300 // Construct a mask for the tbl. We may need to adjust the index for types
13301 // larger than i8.
13302 SmallVector<unsigned, 16> Mask;
13303 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13304 for (unsigned I = 0; I < NumElts; ++I) {
13305 SDValue V = Op.getOperand(I);
13306 if (V.isUndef()) {
13307 for (unsigned OF = 0; OF < OutputFactor; OF++)
13308 Mask.push_back(-1);
13309 continue;
13310 }
13311 // Set the Mask lanes adjusted for the size of the input and output
13312 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13313 // output element, adjusted in their positions per input and output types.
13314 unsigned Lane = V.getConstantOperandVal(1);
13315 for (unsigned S = 0; S < Sources.size(); S++) {
13316 if (V.getOperand(0) == Sources[S].Vec) {
13317 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13318 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13319 for (unsigned OF = 0; OF < OutputFactor; OF++)
13320 Mask.push_back(InputBase + OF);
13321 break;
13322 }
13323 }
13324 }
13325
13326 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13327 // v16i8, and the TBLMask
13328 SmallVector<SDValue, 16> TBLOperands;
13329 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13330 ? Intrinsic::aarch64_neon_tbl3
13331 : Intrinsic::aarch64_neon_tbl4,
13332 DL, MVT::i32));
13333 for (unsigned i = 0; i < Sources.size(); i++) {
13334 SDValue Src = Sources[i].Vec;
13335 EVT SrcVT = Src.getValueType();
13336 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13337 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13338 "Expected a legally typed vector");
13339 if (SrcVT.is64BitVector())
13340 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13341 DAG.getUNDEF(MVT::v8i8));
13342 TBLOperands.push_back(Src);
13343 }
13344
13345 SmallVector<SDValue, 16> TBLMask;
13346 for (unsigned i = 0; i < Mask.size(); i++)
13347 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13348 assert((Mask.size() == 8 || Mask.size() == 16) &&
13349 "Expected a v8i8 or v16i8 Mask");
13350 TBLOperands.push_back(DAG.getBuildVector(
13351 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13352
13353 SDValue Shuffle =
13354 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
13355 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13356 return DAG.getBitcast(VT, Shuffle);
13357 }
13358
13359 if (Sources.size() > 2) {
13360 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13361 << "sensible when at most two source vectors are "
13362 << "involved\n");
13363 return SDValue();
13364 }
13365
13366 // Find out the smallest element size among result and two sources, and use
13367 // it as element size to build the shuffle_vector.
13368 EVT SmallestEltTy = VT.getVectorElementType();
13369 for (auto &Source : Sources) {
13370 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13371 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13372 SmallestEltTy = SrcEltTy;
13373 }
13374 }
13375 unsigned ResMultiplier =
13376 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13377 uint64_t VTSize = VT.getFixedSizeInBits();
13378 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13379 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13380
13381 // If the source vector is too wide or too narrow, we may nevertheless be able
13382 // to construct a compatible shuffle either by concatenating it with UNDEF or
13383 // extracting a suitable range of elements.
13384 for (auto &Src : Sources) {
13385 EVT SrcVT = Src.ShuffleVec.getValueType();
13386
13387 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13388 if (SrcVTSize == TypeSize::getFixed(VTSize))
13389 continue;
13390
13391 // This stage of the search produces a source with the same element type as
13392 // the original, but with a total width matching the BUILD_VECTOR output.
13393 EVT EltVT = SrcVT.getVectorElementType();
13394 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13395 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13396
13397 if (SrcVTSize.getFixedValue() < VTSize) {
13398 assert(2 * SrcVTSize == VTSize);
13399 // We can pad out the smaller vector for free, so concatenate it with an
13400 // UNDEF upper half to reach the required width.
13401 Src.ShuffleVec =
13402 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13403 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13404 continue;
13405 }
13406
13407 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13408 LLVM_DEBUG(
13409 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13410 return SDValue();
13411 }
13412
13413 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13414 LLVM_DEBUG(
13415 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13416 return SDValue();
13417 }
13418
13419 if (Src.MinElt >= NumSrcElts) {
13420 // The extraction can just take the second half
13421 Src.ShuffleVec =
13422 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13423 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13424 Src.WindowBase = -NumSrcElts;
13425 } else if (Src.MaxElt < NumSrcElts) {
13426 // The extraction can just take the first half
13427 Src.ShuffleVec =
13428 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13429 DAG.getConstant(0, DL, MVT::i64));
13430 } else {
13431 // An actual VEXT is needed
13432 SDValue VEXTSrc1 =
13433 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13434 DAG.getConstant(0, DL, MVT::i64));
13435 SDValue VEXTSrc2 =
13436 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13437 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13438 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13439
13440 if (!SrcVT.is64BitVector()) {
13441 LLVM_DEBUG(
13442 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13443 "for SVE vectors.");
13444 return SDValue();
13445 }
13446
13447 Src.ShuffleVec =
13448 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13449 DAG.getConstant(Imm, DL, MVT::i32));
13450 Src.WindowBase = -Src.MinElt;
13451 }
13452 }
13453
13454 // Another possible incompatibility occurs from the vector element types. We
13455 // can fix this by bitcasting the source vectors to the same type we intend
13456 // for the shuffle.
13457 for (auto &Src : Sources) {
13458 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13459 if (SrcEltTy == SmallestEltTy)
13460 continue;
13461 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13462 if (DAG.getDataLayout().isBigEndian()) {
13463 Src.ShuffleVec =
13464 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13465 } else {
13466 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13467 }
13468 Src.WindowScale =
13469 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13470 Src.WindowBase *= Src.WindowScale;
13471 }
13472
13473 // Final check before we try to actually produce a shuffle.
13474 LLVM_DEBUG({
13475 for (auto Src : Sources)
13476 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13477 });
13478
13479 // The stars all align; our next step is to produce the mask for the shuffle.
13480 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13481 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13482 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13483 SDValue Entry = Op.getOperand(i);
13484 if (Entry.isUndef())
13485 continue;
13486
13487 auto Src = find(Sources, Entry.getOperand(0));
13488 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13489
13490 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13491 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13492 // segment.
13493 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13494 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13495 VT.getScalarSizeInBits());
13496 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13497
13498 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13499 // starting at the appropriate offset.
13500 int *LaneMask = &Mask[i * ResMultiplier];
13501
13502 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13503 ExtractBase += NumElts * (Src - Sources.begin());
13504 for (int j = 0; j < LanesDefined; ++j)
13505 LaneMask[j] = ExtractBase + j;
13506 }
13507
13508 // Final check before we try to produce nonsense...
13509 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13510 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13511 return SDValue();
13512 }
13513
13514 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13515 for (unsigned i = 0; i < Sources.size(); ++i)
13516 ShuffleOps[i] = Sources[i].ShuffleVec;
13517
13518 SDValue Shuffle =
13519 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13520 SDValue V;
13521 if (DAG.getDataLayout().isBigEndian()) {
13522 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13523 } else {
13524 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13525 }
13526
13527 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13528 dbgs() << "Reshuffle, creating node: "; V.dump(););
13529
13530 return V;
13531}
13532
13533// check if an EXT instruction can handle the shuffle mask when the
13534// vector sources of the shuffle are the same.
13535static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13536 unsigned NumElts = VT.getVectorNumElements();
13537
13538 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13539 if (M[0] < 0)
13540 return false;
13541
13542 Imm = M[0];
13543
13544 // If this is a VEXT shuffle, the immediate value is the index of the first
13545 // element. The other shuffle indices must be the successive elements after
13546 // the first one.
13547 unsigned ExpectedElt = Imm;
13548 for (unsigned i = 1; i < NumElts; ++i) {
13549 // Increment the expected index. If it wraps around, just follow it
13550 // back to index zero and keep going.
13551 ++ExpectedElt;
13552 if (ExpectedElt == NumElts)
13553 ExpectedElt = 0;
13554
13555 if (M[i] < 0)
13556 continue; // ignore UNDEF indices
13557 if (ExpectedElt != static_cast<unsigned>(M[i]))
13558 return false;
13559 }
13560
13561 return true;
13562}
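// Worked example (illustrative, assumed values): for a v8i8 single-source
// shuffle with mask <3, 4, 5, 6, 7, 0, 1, 2>, the indices are consecutive
// modulo NumElts starting at 3, so the helper above succeeds with Imm == 3 and
// the caller can emit "EXT v0.8b, v1.8b, v1.8b, #3".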
13563
13564// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13565// v4i32s. This is really a truncate, which we can construct out of (legal)
13566// concats and truncate nodes.
13567 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
13568 if (V.getValueType() != MVT::v16i8)
13569 return SDValue();
13570 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13571
13572 for (unsigned X = 0; X < 4; X++) {
13573 // Check the first item in each group is an extract from lane 0 of a v4i32
13574 // or v4i16.
13575 SDValue BaseExt = V.getOperand(X * 4);
13576 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13577 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13578 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13579 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13580 BaseExt.getConstantOperandVal(1) != 0)
13581 return SDValue();
13582 SDValue Base = BaseExt.getOperand(0);
13583 // And check the other items are extracts from the same vector.
13584 for (unsigned Y = 1; Y < 4; Y++) {
13585 SDValue Ext = V.getOperand(X * 4 + Y);
13586 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13587 Ext.getOperand(0) != Base ||
13588 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13589 Ext.getConstantOperandVal(1) != Y)
13590 return SDValue();
13591 }
13592 }
13593
13594 // Turn the buildvector into a series of truncates and concats, which will
13595 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
13596 // concatenated together to produce two v8i16s. These are both truncated and
13597 // concatenated together.
13598 SDLoc DL(V);
13599 SDValue Trunc[4] = {
13600 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13601 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13602 for (SDValue &V : Trunc)
13603 if (V.getValueType() == MVT::v4i32)
13604 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13605 SDValue Concat0 =
13606 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13607 SDValue Concat1 =
13608 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13609 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13610 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13611 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13612}
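// Illustrative example (assumed operands): a v16i8 BUILD_VECTOR of the form
// <a0,a1,a2,a3, b0..b3, c0..c3, d0..d3>, where each group extracts lanes 0..3
// of a v4i32 source, is really a truncate of the concatenation of A, B, C and
// D. The code above emits
//   concat(trunc(concat(trunc A, trunc B)), trunc(concat(trunc C, trunc D)))
// which the backend later matches as a chain of UZP1 instructions.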
13613
13614 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
13615 /// element width than the vector lane type. If that is the case, the function
13616 /// returns true and writes the value of the DUP instruction lane operand into
13617 /// DupLaneOp.
13618static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13619 unsigned &DupLaneOp) {
13620 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13621 "Only possible block sizes for wide DUP are: 16, 32, 64");
13622
13623 if (BlockSize <= VT.getScalarSizeInBits())
13624 return false;
13625 if (BlockSize % VT.getScalarSizeInBits() != 0)
13626 return false;
13627 if (VT.getSizeInBits() % BlockSize != 0)
13628 return false;
13629
13630 size_t SingleVecNumElements = VT.getVectorNumElements();
13631 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13632 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13633
13634 // We are looking for masks like
13635 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13636 // might be replaced by 'undefined'. BlockElts will eventually contain
13637 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13638 // for the above examples).
13639 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13640 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13641 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13642 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13643 if (Elt < 0)
13644 continue;
13645 // For now we don't support shuffles that use the second operand
13646 if ((unsigned)Elt >= SingleVecNumElements)
13647 return false;
13648 if (BlockElts[I] < 0)
13649 BlockElts[I] = Elt;
13650 else if (BlockElts[I] != Elt)
13651 return false;
13652 }
13653
13654 // We found a candidate block (possibly with some undefs). It must be a
13655 // sequence of consecutive integers starting with a value divisible by
13656 // NumEltsPerBlock with some values possibly replaced by undef-s.
13657
13658 // Find first non-undef element
13659 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13660 assert(FirstRealEltIter != BlockElts.end() &&
13661 "Shuffle with all-undefs must have been caught by previous cases, "
13662 "e.g. isSplat()");
13663 if (FirstRealEltIter == BlockElts.end()) {
13664 DupLaneOp = 0;
13665 return true;
13666 }
13667
13668 // Index of FirstRealElt in BlockElts
13669 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13670
13671 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13672 return false;
13673 // BlockElts[0] must have the following value if it isn't undef:
13674 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13675
13676 // Check the first element
13677 if (Elt0 % NumEltsPerBlock != 0)
13678 return false;
13679 // Check that the sequence indeed consists of consecutive integers (modulo
13680 // undefs)
13681 for (size_t I = 0; I < NumEltsPerBlock; I++)
13682 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13683 return false;
13684
13685 DupLaneOp = Elt0 / NumEltsPerBlock;
13686 return true;
13687}
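// Worked example (illustrative): for a v8i16 shuffle mask
// <4, 5, 4, 5, 4, 5, 4, 5> and BlockSize == 32, each 2-element block is <4, 5>,
// so the mask is a DUP of 32-bit lane 2 of the source viewed as v4i32 and the
// helper returns true with DupLaneOp == 2.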
13688
13689// check if an EXT instruction can handle the shuffle mask when the
13690// vector sources of the shuffle are different.
13691static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13692 unsigned &Imm) {
13693 // Look for the first non-undef element.
13694 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13695
13696 // Benefit from APInt to handle overflow when calculating expected element.
13697 unsigned NumElts = VT.getVectorNumElements();
13698 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13699 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13700 /*implicitTrunc=*/true);
13701 // The following shuffle indices must be the successive elements after the
13702 // first real element.
13703 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13704 return Elt != ExpectedElt++ && Elt >= 0;
13705 });
13706 if (FoundWrongElt)
13707 return false;
13708
13709 // The index of an EXT is the first element if it is not UNDEF.
13710 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13711 // value of the first element. E.g.
13712 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13713 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13714 // ExpectedElt is the last mask index plus 1.
13715 Imm = ExpectedElt.getZExtValue();
13716
13717 // There are two different cases that require reversing the input vectors.
13718 // For example, for vector <4 x i32> we have the following cases:
13719 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13720 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13721 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13722 // reversing the two input vectors.
13723 if (Imm < NumElts)
13724 ReverseEXT = true;
13725 else
13726 Imm -= NumElts;
13727
13728 return true;
13729}
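// Worked example (illustrative): for v4i32 and mask <-1, -1, 7, 0>, the first
// real element is 7 and the remaining indices continue the sequence modulo
// 2*NumElts, so ExpectedElt ends up as 1; the helper returns Imm == 1 with
// ReverseEXT == true, and the caller swaps the inputs and scales the immediate
// by the element size to form the final EXT.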
13730
13731/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13732/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13733/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13734static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13735 unsigned NumElts = VT.getVectorNumElements();
13736 if (NumElts % 2 != 0)
13737 return false;
13738 WhichResult = (M[0] == 0 ? 0 : 1);
13739 unsigned Idx = WhichResult * NumElts / 2;
13740 for (unsigned i = 0; i != NumElts; i += 2) {
13741 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13742 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13743 return false;
13744 Idx += 1;
13745 }
13746
13747 return true;
13748}
13749
13750/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13751/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13752 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13753static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13754 unsigned Half = VT.getVectorNumElements() / 2;
13755 WhichResult = (M[0] == 0 ? 0 : 1);
13756 for (unsigned j = 0; j != 2; ++j) {
13757 unsigned Idx = WhichResult;
13758 for (unsigned i = 0; i != Half; ++i) {
13759 int MIdx = M[i + j * Half];
13760 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13761 return false;
13762 Idx += 2;
13763 }
13764 }
13765
13766 return true;
13767}
13768
13769/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13770/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13771/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13772static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13773 unsigned NumElts = VT.getVectorNumElements();
13774 if (NumElts % 2 != 0)
13775 return false;
13776 WhichResult = (M[0] == 0 ? 0 : 1);
13777 for (unsigned i = 0; i < NumElts; i += 2) {
13778 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13779 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13780 return false;
13781 }
13782 return true;
13783}
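// Illustrative examples for the three *_v_undef helpers above (assumed v4i16
// masks): <0, 0, 1, 1> is the canonical single-source form of ZIP1,
// <0, 2, 0, 2> of UZP1, and <0, 0, 2, 2> of TRN1. When the mask instead starts
// from the other result (e.g. <1, 3, 1, 3> for UZP), WhichResult is set to 1
// and the callers pick the corresponding ZIP2/UZP2/TRN2 opcode.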
13784
13785static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13786 bool &DstIsLeft, int &Anomaly) {
13787 if (M.size() != static_cast<size_t>(NumInputElements))
13788 return false;
13789
13790 int NumLHSMatch = 0, NumRHSMatch = 0;
13791 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13792
13793 for (int i = 0; i < NumInputElements; ++i) {
13794 if (M[i] == -1) {
13795 ++NumLHSMatch;
13796 ++NumRHSMatch;
13797 continue;
13798 }
13799
13800 if (M[i] == i)
13801 ++NumLHSMatch;
13802 else
13803 LastLHSMismatch = i;
13804
13805 if (M[i] == i + NumInputElements)
13806 ++NumRHSMatch;
13807 else
13808 LastRHSMismatch = i;
13809 }
13810
13811 if (NumLHSMatch == NumInputElements - 1) {
13812 DstIsLeft = true;
13813 Anomaly = LastLHSMismatch;
13814 return true;
13815 } else if (NumRHSMatch == NumInputElements - 1) {
13816 DstIsLeft = false;
13817 Anomaly = LastRHSMismatch;
13818 return true;
13819 }
13820
13821 return false;
13822}
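// Worked example (illustrative): with 4 input elements and mask <0, 5, 2, 3>,
// every lane except lane 1 matches the left-hand identity, so the helper
// returns DstIsLeft == true and Anomaly == 1; the caller then emits an INS of
// V2 lane 1 (mask value 5 minus NumElts) into lane 1 of V1.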
13823
13824static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13825 if (VT.getSizeInBits() != 128)
13826 return false;
13827
13828 unsigned NumElts = VT.getVectorNumElements();
13829
13830 for (int I = 0, E = NumElts / 2; I != E; I++) {
13831 if (Mask[I] != I)
13832 return false;
13833 }
13834
13835 int Offset = NumElts / 2;
13836 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13837 if (Mask[I] != I + SplitLHS * Offset)
13838 return false;
13839 }
13840
13841 return true;
13842}
13843
13844 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
13845 SDLoc DL(Op);
13846 EVT VT = Op.getValueType();
13847 SDValue V0 = Op.getOperand(0);
13848 SDValue V1 = Op.getOperand(1);
13849 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13850
13851 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
13852 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
13853 return SDValue();
13854
13855 bool SplitV0 = V0.getValueSizeInBits() == 128;
13856
13857 if (!isConcatMask(Mask, VT, SplitV0))
13858 return SDValue();
13859
13860 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13861 if (SplitV0) {
13862 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
13863 DAG.getConstant(0, DL, MVT::i64));
13864 }
13865 if (V1.getValueSizeInBits() == 128) {
13866 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
13867 DAG.getConstant(0, DL, MVT::i64));
13868 }
13869 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
13870}
13871
13872/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
13873/// the specified operations to build the shuffle. ID is the perfect-shuffle
13874 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
13875 /// table entry and LHS/RHS are the immediate inputs for this stage of the
13876 /// shuffle.
13877 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
13878 unsigned PFEntry, SDValue LHS,
13879 SDValue RHS, SelectionDAG &DAG,
13880 const SDLoc &DL) {
13881 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13882 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13883 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13884
13885 enum {
13886 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13887 OP_VREV,
13888 OP_VDUP0,
13889 OP_VDUP1,
13890 OP_VDUP2,
13891 OP_VDUP3,
13892 OP_VEXT1,
13893 OP_VEXT2,
13894 OP_VEXT3,
13895 OP_VUZPL, // VUZP, left result
13896 OP_VUZPR, // VUZP, right result
13897 OP_VZIPL, // VZIP, left result
13898 OP_VZIPR, // VZIP, right result
13899 OP_VTRNL, // VTRN, left result
13900 OP_VTRNR, // VTRN, right result
13901 OP_MOVLANE // Move lane. RHSID is the lane to move into
13902 };
13903
13904 if (OpNum == OP_COPY) {
13905 if (LHSID == (1 * 9 + 2) * 9 + 3)
13906 return LHS;
13907 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13908 return RHS;
13909 }
13910
13911 if (OpNum == OP_MOVLANE) {
13912 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13913 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13914 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13915 Elt = 3 - Elt;
13916 while (Elt > 0) {
13917 ID /= 9;
13918 Elt--;
13919 }
13920 return (ID % 9 == 8) ? -1 : ID % 9;
13921 };
13922
13923 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13924 // get the lane to move from the PFID, which is always from the
13925 // original vectors (V1 or V2).
13926 SDValue OpLHS = GeneratePerfectShuffle(
13927 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
13928 EVT VT = OpLHS.getValueType();
13929 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13930 unsigned ExtLane = 0;
13931 SDValue Input;
13932
13933 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
13934 // convert into a higher type.
13935 if (RHSID & 0x4) {
13936 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13937 if (MaskElt == -1)
13938 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13939 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13940 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13941 Input = MaskElt < 2 ? V1 : V2;
13942 if (VT.getScalarSizeInBits() == 16) {
13943 Input = DAG.getBitcast(MVT::v2f32, Input);
13944 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
13945 } else {
13946 assert(VT.getScalarSizeInBits() == 32 &&
13947 "Expected 16 or 32 bit shuffle elements");
13948 Input = DAG.getBitcast(MVT::v2f64, Input);
13949 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
13950 }
13951 } else {
13952 int MaskElt = getPFIDLane(ID, RHSID);
13953 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13954 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13955 Input = MaskElt < 4 ? V1 : V2;
13956 // Be careful about creating illegal types. Use f16 instead of i16.
13957 if (VT == MVT::v4i16) {
13958 Input = DAG.getBitcast(MVT::v4f16, Input);
13959 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
13960 }
13961 }
13962 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
13963 Input.getValueType().getVectorElementType(),
13964 Input, DAG.getVectorIdxConstant(ExtLane, DL));
13965 SDValue Ins =
13966 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
13967 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
13968 return DAG.getBitcast(VT, Ins);
13969 }
13970
13971 SDValue OpLHS, OpRHS;
13972 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
13973 RHS, DAG, DL);
13974 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
13975 RHS, DAG, DL);
13976 EVT VT = OpLHS.getValueType();
13977
13978 switch (OpNum) {
13979 default:
13980 llvm_unreachable("Unknown shuffle opcode!");
13981 case OP_VREV:
13982 // VREV divides the vector in half and swaps within the half.
13983 if (VT.getVectorElementType() == MVT::i32 ||
13984 VT.getVectorElementType() == MVT::f32)
13985 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
13986 // vrev <4 x i16> -> REV32
13987 if (VT.getVectorElementType() == MVT::i16 ||
13988 VT.getVectorElementType() == MVT::f16 ||
13989 VT.getVectorElementType() == MVT::bf16)
13990 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
13991 // vrev <4 x i8> -> REV16
13992 assert(VT.getVectorElementType() == MVT::i8);
13993 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
13994 case OP_VDUP0:
13995 case OP_VDUP1:
13996 case OP_VDUP2:
13997 case OP_VDUP3: {
13998 EVT EltTy = VT.getVectorElementType();
13999 unsigned Opcode;
14000 if (EltTy == MVT::i8)
14001 Opcode = AArch64ISD::DUPLANE8;
14002 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14003 Opcode = AArch64ISD::DUPLANE16;
14004 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14005 Opcode = AArch64ISD::DUPLANE32;
14006 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14007 Opcode = AArch64ISD::DUPLANE64;
14008 else
14009 llvm_unreachable("Invalid vector element type?");
14010
14011 if (VT.getSizeInBits() == 64)
14012 OpLHS = WidenVector(OpLHS, DAG);
14013 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14014 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14015 }
14016 case OP_VEXT1:
14017 case OP_VEXT2:
14018 case OP_VEXT3: {
14019 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14020 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14021 DAG.getConstant(Imm, DL, MVT::i32));
14022 }
14023 case OP_VUZPL:
14024 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14025 case OP_VUZPR:
14026 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14027 case OP_VZIPL:
14028 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14029 case OP_VZIPR:
14030 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14031 case OP_VTRNL:
14032 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14033 case OP_VTRNR:
14034 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14035 }
14036}
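// Decoding note (illustrative, matching the bit layout unpacked above): each
// PerfectShuffleTable entry holds the operation in bits [29:26], the LHS
// operand ID in bits [25:13] and the RHS operand ID in bits [12:0]. Operand IDs
// encode the four result lanes as base-9 digits (8 meaning undef), so the
// OP_COPY identity <0, 1, 2, 3> is (1 * 9 + 2) * 9 + 3 == 102 and <4, 5, 6, 7>
// is ((4 * 9 + 5) * 9 + 6) * 9 + 7, as the asserts above expect.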
14037
14038 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14039 SelectionDAG &DAG) {
14040 // Check to see if we can use the TBL instruction.
14041 SDValue V1 = Op.getOperand(0);
14042 SDValue V2 = Op.getOperand(1);
14043 SDLoc DL(Op);
14044
14045 EVT EltVT = Op.getValueType().getVectorElementType();
14046 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14047
14048 bool Swap = false;
14049 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14050 std::swap(V1, V2);
14051 Swap = true;
14052 }
14053
14054 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14055 // out of range values with 0s. We do need to make sure that any out-of-range
14056 // values are really out-of-range for a v16i8 vector.
14057 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14058 MVT IndexVT = MVT::v8i8;
14059 unsigned IndexLen = 8;
14060 if (Op.getValueSizeInBits() == 128) {
14061 IndexVT = MVT::v16i8;
14062 IndexLen = 16;
14063 }
14064
14065 SmallVector<SDValue, 8> TBLMask;
14066 for (int Val : ShuffleMask) {
14067 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14068 unsigned Offset = Byte + Val * BytesPerElt;
14069 if (Swap)
14070 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14071 if (IsUndefOrZero && Offset >= IndexLen)
14072 Offset = 255;
14073 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14074 }
14075 }
14076
14077 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14078 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14079
14080 SDValue Shuffle;
14081 if (IsUndefOrZero) {
14082 if (IndexLen == 8)
14083 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14084 Shuffle = DAG.getNode(
14085 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14086 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14087 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14088 } else {
14089 if (IndexLen == 8) {
14090 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14091 Shuffle = DAG.getNode(
14092 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14093 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14094 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14095 } else {
14096 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14097 // cannot currently represent the register constraints on the input
14098 // table registers.
14099 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14100 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14101 // IndexLen));
14102 Shuffle = DAG.getNode(
14103 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14104 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
14105 V2Cst,
14106 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14107 }
14108 }
14109 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14110}
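// Illustrative example (assumed types): for a v4i16 two-input shuffle with mask
// <0, 4, 1, 5>, the loop above expands every element index into its two byte
// offsets, producing the v8i8 TBL mask <0, 1, 8, 9, 2, 3, 10, 11> over the byte
// view of concat(V1, V2), which is then fed to a single tbl1 (or a tbl2 for
// 128-bit inputs).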
14111
14112static unsigned getDUPLANEOp(EVT EltType) {
14113 if (EltType == MVT::i8)
14114 return AArch64ISD::DUPLANE8;
14115 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14116 return AArch64ISD::DUPLANE16;
14117 if (EltType == MVT::i32 || EltType == MVT::f32)
14118 return AArch64ISD::DUPLANE32;
14119 if (EltType == MVT::i64 || EltType == MVT::f64)
14120 return AArch64ISD::DUPLANE64;
14121
14122 llvm_unreachable("Invalid vector element type?");
14123}
14124
14125static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14126 unsigned Opcode, SelectionDAG &DAG) {
14127 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14128 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14129 // Match: dup (bitcast (extract_subv X, C)), LaneC
14130 if (BitCast.getOpcode() != ISD::BITCAST ||
14131 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14132 return false;
14133
14134 // The extract index must align in the destination type. That may not
14135 // happen if the bitcast is from narrow to wide type.
14136 SDValue Extract = BitCast.getOperand(0);
14137 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14138 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14139 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14140 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14141 if (ExtIdxInBits % CastedEltBitWidth != 0)
14142 return false;
14143
14144 // Can't handle cases where vector size is not 128-bit
14145 if (!Extract.getOperand(0).getValueType().is128BitVector())
14146 return false;
14147
14148 // Update the lane value by offsetting with the scaled extract index.
14149 LaneC += ExtIdxInBits / CastedEltBitWidth;
14150
14151 // Determine the casted vector type of the wide vector input.
14152 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14153 // Examples:
14154 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14155 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14156 unsigned SrcVecNumElts =
14157 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14158 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14159 SrcVecNumElts);
14160 return true;
14161 };
14162 MVT CastVT;
14163 if (getScaledOffsetDup(V, Lane, CastVT)) {
14164 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14165 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14166 V.getOperand(0).getValueType().is128BitVector()) {
14167 // The lane is incremented by the index of the extract.
14168 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14169 Lane += V.getConstantOperandVal(1);
14170 V = V.getOperand(0);
14171 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14172 // The lane is decremented if we are splatting from the 2nd operand.
14173 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14174 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14175 Lane -= Idx * VT.getVectorNumElements() / 2;
14176 V = WidenVector(V.getOperand(Idx), DAG);
14177 } else if (VT.getSizeInBits() == 64) {
14178 // Widen the operand to 128-bit register with undef.
14179 V = WidenVector(V, DAG);
14180 }
14181 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14182}
14183
14184// Try to widen element type to get a new mask value for a better permutation
14185// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14186// UZP1/2, TRN1/2, REV, INS, etc.
14187// For example:
14188// shufflevector <4 x i32> %a, <4 x i32> %b,
14189// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14190// is equivalent to:
14191// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14192// Finally, we can get:
14193// mov v0.d[0], v1.d[1]
14194 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14195 SDLoc DL(Op);
14196 EVT VT = Op.getValueType();
14197 EVT ScalarVT = VT.getVectorElementType();
14198 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14199 SDValue V0 = Op.getOperand(0);
14200 SDValue V1 = Op.getOperand(1);
14201 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14202
14203 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...,
14204 // we need to make sure the wider element type is legal. Thus, ElementSize
14205 // should not be larger than 32 bits, and the i1 type should also be excluded.
14206 if (ElementSize > 32 || ElementSize == 1)
14207 return SDValue();
14208
14209 SmallVector<int, 8> NewMask;
14210 if (widenShuffleMaskElts(Mask, NewMask)) {
14211 MVT NewEltVT = VT.isFloatingPoint()
14212 ? MVT::getFloatingPointVT(ElementSize * 2)
14213 : MVT::getIntegerVT(ElementSize * 2);
14214 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14215 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14216 V0 = DAG.getBitcast(NewVT, V0);
14217 V1 = DAG.getBitcast(NewVT, V1);
14218 return DAG.getBitcast(VT,
14219 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14220 }
14221 }
14222
14223 return SDValue();
14224}
14225
14226// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14227 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14228 ArrayRef<int> ShuffleMask,
14229 SelectionDAG &DAG) {
14230 SDValue Tbl1 = Op->getOperand(0);
14231 SDValue Tbl2 = Op->getOperand(1);
14232 SDLoc DL(Op);
14233 SDValue Tbl2ID =
14234 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14235
14236 EVT VT = Op.getValueType();
14237 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14238 Tbl1.getOperand(0) != Tbl2ID ||
14239 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14240 Tbl2.getOperand(0) != Tbl2ID)
14241 return SDValue();
14242
14243 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14244 return SDValue();
14245
14246 SDValue Mask1 = Tbl1.getOperand(3);
14247 SDValue Mask2 = Tbl2.getOperand(3);
14248 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14249 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14250 return SDValue();
14251
14252 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14253 for (unsigned I = 0; I < 16; I++) {
14254 if (ShuffleMask[I] < 16)
14255 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14256 else {
14257 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14258 if (!C)
14259 return SDValue();
14260 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14261 }
14262 }
14263
14264 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14265 SDValue ID =
14266 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14267
14268 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14269 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14270 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14271}
14272
14273// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14274// but we don't have an appropriate instruction,
14275// so custom-lower it as ZIP1-with-zeros.
14276SDValue
14277AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14278 SelectionDAG &DAG) const {
14279 SDLoc DL(Op);
14280 EVT VT = Op.getValueType();
14281 SDValue SrcOp = Op.getOperand(0);
14282 EVT SrcVT = SrcOp.getValueType();
14283 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14284 "Unexpected extension factor.");
14285 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14286 // FIXME: support multi-step zipping?
14287 if (Scale != 2)
14288 return SDValue();
14289 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14290 return DAG.getBitcast(VT,
14291 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14292}
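// Illustrative example: a zero_extend_vector_inreg from v8i8 to v4i16 has
// Scale == 2, so it is lowered as "zip1 v8i8 Src, Zeros" and the result is
// bitcast back to v4i16; interleaving each source byte with a zero byte yields
// the zero-extended 16-bit lanes (assuming the usual little-endian lane
// layout).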
14293
14294SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14295 SelectionDAG &DAG) const {
14296 SDLoc DL(Op);
14297 EVT VT = Op.getValueType();
14298
14299 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14300
14301 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14302 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14303
14304 // Convert shuffles that are directly supported on NEON to target-specific
14305 // DAG nodes, instead of keeping them as shuffles and matching them again
14306 // during code selection. This is more efficient and avoids the possibility
14307 // of inconsistencies between legalization and selection.
14308 ArrayRef<int> ShuffleMask = SVN->getMask();
14309
14310 SDValue V1 = Op.getOperand(0);
14311 SDValue V2 = Op.getOperand(1);
14312
14313 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14314 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14315 "Unexpected VECTOR_SHUFFLE mask size!");
14316
14317 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14318 return Res;
14319
14320 if (SVN->isSplat()) {
14321 int Lane = SVN->getSplatIndex();
14322 // If this is an undef splat, generate it via "just" vdup, if possible.
14323 if (Lane == -1)
14324 Lane = 0;
14325
14326 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14327 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14328 V1.getOperand(0));
14329 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14330 // constant. If so, we can just reference the lane's definition directly.
14331 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14332 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14333 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14334
14335 // Otherwise, duplicate from the lane of the input vector.
14336 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14337 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14338 }
14339
14340 // Check if the mask matches a DUP for a wider element
14341 for (unsigned LaneSize : {64U, 32U, 16U}) {
14342 unsigned Lane = 0;
14343 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14344 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14345 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14346 : AArch64ISD::DUPLANE16;
14347 // Cast V1 to an integer vector with required lane size
14348 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14349 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14350 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14351 V1 = DAG.getBitcast(NewVecTy, V1);
14352 // Construct the DUP instruction
14353 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14354 // Cast back to the original type
14355 return DAG.getBitcast(VT, V1);
14356 }
14357 }
14358
14359 unsigned NumElts = VT.getVectorNumElements();
14360 unsigned EltSize = VT.getScalarSizeInBits();
14361 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14362 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14363 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14364 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14365 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14366 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14367
14368 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14369 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
14370 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14371 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14372 DAG.getConstant(8, DL, MVT::i32));
14373 }
14374
14375 bool ReverseEXT = false;
14376 unsigned Imm;
14377 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14378 if (ReverseEXT)
14379 std::swap(V1, V2);
14380 Imm *= getExtFactor(V1);
14381 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14382 DAG.getConstant(Imm, DL, MVT::i32));
14383 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14384 Imm *= getExtFactor(V1);
14385 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14386 DAG.getConstant(Imm, DL, MVT::i32));
14387 }
14388
14389 unsigned WhichResult;
14390 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14391 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14392 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14393 }
14394 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14395 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14396 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14397 }
14398 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14399 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14400 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14401 }
14402
14403 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14404 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14405 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14406 }
14407 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14408 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14409 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14410 }
14411 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14412 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14413 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14414 }
14415
14416 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14417 return Concat;
14418
14419 bool DstIsLeft;
14420 int Anomaly;
14421 int NumInputElements = V1.getValueType().getVectorNumElements();
14422 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14423 SDValue DstVec = DstIsLeft ? V1 : V2;
14424 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14425
14426 SDValue SrcVec = V1;
14427 int SrcLane = ShuffleMask[Anomaly];
14428 if (SrcLane >= NumInputElements) {
14429 SrcVec = V2;
14430 SrcLane -= NumElts;
14431 }
14432 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14433
14434 EVT ScalarVT = VT.getVectorElementType();
14435
14436 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14437 ScalarVT = MVT::i32;
14438
14439 return DAG.getNode(
14440 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14441 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14442 DstLaneV);
14443 }
14444
14445 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14446 return NewSD;
14447
14448 // If the shuffle is not directly supported and it has 4 elements, use
14449 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14450 if (NumElts == 4) {
14451 unsigned PFIndexes[4];
14452 for (unsigned i = 0; i != 4; ++i) {
14453 if (ShuffleMask[i] < 0)
14454 PFIndexes[i] = 8;
14455 else
14456 PFIndexes[i] = ShuffleMask[i];
14457 }
14458
14459 // Compute the index in the perfect shuffle table.
14460 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14461 PFIndexes[2] * 9 + PFIndexes[3];
14462 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14463 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14464 DL);
14465 }
14466
14467 // Check for a "select shuffle", generating a BSL to pick between lanes in
14468 // V1/V2.
14469 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14470 assert(VT.getScalarSizeInBits() <= 32 &&
14471 "Expected larger vector element sizes to be handled already");
14472 SmallVector<SDValue> MaskElts;
14473 for (int M : ShuffleMask)
14474 MaskElts.push_back(DAG.getConstant(
14475 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
14476 EVT IVT = VT.changeVectorElementTypeToInteger();
14477 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14478 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14479 DAG.getBitcast(IVT, V1),
14480 DAG.getBitcast(IVT, V2)));
14481 }
14482
14483 // Fall back to generating a TBL
14484 return GenerateTBL(Op, ShuffleMask, DAG);
14485}
14486
14487SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14488 SelectionDAG &DAG) const {
14489 EVT VT = Op.getValueType();
14490
14491 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14492 return LowerToScalableOp(Op, DAG);
14493
14494 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14495 "Unexpected vector type!");
14496
14497 // We can handle the constant cases during isel.
14498 if (isa<ConstantSDNode>(Op.getOperand(0)))
14499 return Op;
14500
14501 // There isn't a natural way to handle the general i1 case, so we use some
14502 // trickery with whilelo.
14503 SDLoc DL(Op);
14504 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14505 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14506 DAG.getValueType(MVT::i1));
14507 SDValue ID =
14508 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14509 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14510 if (VT == MVT::nxv1i1)
14511 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14512 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14513 Zero, SplatVal),
14514 Zero);
14515 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14516}
14517
14518SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14519 SelectionDAG &DAG) const {
14520 SDLoc DL(Op);
14521
14522 EVT VT = Op.getValueType();
14523 if (!isTypeLegal(VT) || !VT.isScalableVector())
14524 return SDValue();
14525
14526 // Current lowering only supports the SVE-ACLE types.
14527 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14528 return SDValue();
14529
14530 // The DUPQ operation is independent of element type so normalise to i64s.
14531 SDValue Idx128 = Op.getOperand(2);
14532
14533 // DUPQ can be used when idx is in range.
14534 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14535 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14536 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14537 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14538 }
14539
14540 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14541
14542 // The ACLE says this must produce the same result as:
14543 // svtbl(data, svadd_x(svptrue_b64(),
14544 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14545 // index * 2))
14546 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14547 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14548
14549 // create the vector 0,1,0,1,...
14550 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14551 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14552
14553 // create the vector idx64,idx64+1,idx64,idx64+1,...
14554 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14555 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14556 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14557
14558 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14559 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14560 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14561}
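// Illustrative example (assumed index): when the 128-bit lane index is not a
// small constant, e.g. a runtime value that happens to be 2, the code above
// forms the nxv2i64 index vector <4, 5, 4, 5, ...> (2 * idx plus the
// alternating 0,1 step) and uses TBL to copy 64-bit lanes 4 and 5 into every
// 128-bit block of the result.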
14562
14563
14564static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14565 APInt &UndefBits) {
14566 EVT VT = BVN->getValueType(0);
14567 APInt SplatBits, SplatUndef;
14568 unsigned SplatBitSize;
14569 bool HasAnyUndefs;
14570 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14571 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14572
14573 for (unsigned i = 0; i < NumSplats; ++i) {
14574 CnstBits <<= SplatBitSize;
14575 UndefBits <<= SplatBitSize;
14576 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14577 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14578 }
14579
14580 return true;
14581 }
14582
14583 return false;
14584}
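// Illustrative example: for a v4i32 BUILD_VECTOR splatting 0x000000FF,
// isConstantSplat reports SplatBits == 0xFF with SplatBitSize == 32, and the
// loop above replicates it into the 128-bit value
// CnstBits == 0x000000FF000000FF000000FF000000FF, with UndefBits marking any
// lanes that were undef.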
14585
14586// Try 64-bit splatted SIMD immediate.
14587static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14588 const APInt &Bits) {
14589 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14590 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14591 EVT VT = Op.getValueType();
14592 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14593
14594 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14595 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14596
14597 SDLoc DL(Op);
14598 SDValue Mov =
14599 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14600 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14601 }
14602 }
14603
14604 return SDValue();
14605}
14606
14607// Try 32-bit splatted SIMD immediate.
14608static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14609 const APInt &Bits,
14610 const SDValue *LHS = nullptr) {
14611 EVT VT = Op.getValueType();
14612 if (VT.isFixedLengthVector() &&
14613 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14614 return SDValue();
14615
14616 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14617 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14618 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14619 bool isAdvSIMDModImm = false;
14620 uint64_t Shift;
14621
14622 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14623 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14624 Shift = 0;
14625 }
14626 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14627 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14628 Shift = 8;
14629 }
14630 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14631 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14632 Shift = 16;
14633 }
14634 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14635 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14636 Shift = 24;
14637 }
14638
14639 if (isAdvSIMDModImm) {
14640 SDLoc DL(Op);
14641 SDValue Mov;
14642
14643 if (LHS)
14644 Mov = DAG.getNode(NewOp, DL, MovTy,
14645 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14646 DAG.getConstant(Value, DL, MVT::i32),
14647 DAG.getConstant(Shift, DL, MVT::i32));
14648 else
14649 Mov =
14650 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14651 DAG.getConstant(Shift, DL, MVT::i32));
14652
14653 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14654 }
14655 }
14656
14657 return SDValue();
14658}
14659
14660// Try 16-bit splatted SIMD immediate.
14661static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14662 const APInt &Bits,
14663 const SDValue *LHS = nullptr) {
14664 EVT VT = Op.getValueType();
14665 if (VT.isFixedLengthVector() &&
14666 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14667 return SDValue();
14668
14669 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14670 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14671 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14672 bool isAdvSIMDModImm = false;
14673 uint64_t Shift;
14674
14675 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14676 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14677 Shift = 0;
14678 }
14679 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14680 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14681 Shift = 8;
14682 }
14683
14684 if (isAdvSIMDModImm) {
14685 SDLoc DL(Op);
14686 SDValue Mov;
14687
14688 if (LHS)
14689 Mov = DAG.getNode(NewOp, DL, MovTy,
14690 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14691 DAG.getConstant(Value, DL, MVT::i32),
14692 DAG.getConstant(Shift, DL, MVT::i32));
14693 else
14694 Mov =
14695 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14696 DAG.getConstant(Shift, DL, MVT::i32));
14697
14698 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14699 }
14700 }
14701
14702 return SDValue();
14703}
14704
14705// Try 32-bit splatted SIMD immediate with shifted ones.
14706 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14707 SelectionDAG &DAG, const APInt &Bits) {
14708 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14709 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14710 EVT VT = Op.getValueType();
14711 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14712 bool isAdvSIMDModImm = false;
14713 uint64_t Shift;
14714
14715 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14716 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14717 Shift = 264;
14718 }
14719 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14720 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14721 Shift = 272;
14722 }
14723
14724 if (isAdvSIMDModImm) {
14725 SDLoc DL(Op);
14726 SDValue Mov =
14727 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14728 DAG.getConstant(Shift, DL, MVT::i32));
14729 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14730 }
14731 }
14732
14733 return SDValue();
14734}
14735
14736// Try 8-bit splatted SIMD immediate.
14737static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14738 const APInt &Bits) {
14739 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14740 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14741 EVT VT = Op.getValueType();
14742 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14743
14744 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14745 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14746
14747 SDLoc DL(Op);
14748 SDValue Mov =
14749 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14750 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14751 }
14752 }
14753
14754 return SDValue();
14755}
14756
14757// Try FP splatted SIMD immediate.
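// Type11 and Type12 correspond to per-lane FP immediates that the vector FMOV
// (immediate) form can materialize: an encodable f32 pattern in each 32-bit
// lane, or, for 128-bit vectors only, an encodable f64 pattern in each 64-bit
// lane.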
14758static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14759 const APInt &Bits) {
14760 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14761 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14762 EVT VT = Op.getValueType();
14763 bool isWide = (VT.getSizeInBits() == 128);
14764 MVT MovTy;
14765 bool isAdvSIMDModImm = false;
14766
14767 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14768 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14769 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14770 }
14771 else if (isWide &&
14772 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14773 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14774 MovTy = MVT::v2f64;
14775 }
14776
14777 if (isAdvSIMDModImm) {
14778 SDLoc DL(Op);
14779 SDValue Mov =
14780 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14781 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14782 }
14783 }
14784
14785 return SDValue();
14786}
14787
14788// Specialized code to quickly find if PotentialBVec is a BuildVector that
14789// consists of only the same constant int value, returned in reference arg
14790// ConstVal
14791static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14792 uint64_t &ConstVal) {
14793 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14794 if (!Bvec)
14795 return false;
14796 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14797 if (!FirstElt)
14798 return false;
14799 EVT VT = Bvec->getValueType(0);
14800 unsigned NumElts = VT.getVectorNumElements();
14801 for (unsigned i = 1; i < NumElts; ++i)
14802 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14803 return false;
14804 ConstVal = FirstElt->getZExtValue();
14805 return true;
14806}
14807
14808 static bool isAllInactivePredicate(SDValue N) {
14809 // Look through cast.
14810 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14811 N = N.getOperand(0);
14812
14813 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14814}
14815
14816 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14817 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14818
14819 // Look through cast.
14820 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14821 N = N.getOperand(0);
14822 // When reinterpreting from a type with fewer elements the "new" elements
14823 // are not active, so bail if they're likely to be used.
14824 if (N.getValueType().getVectorMinNumElements() < NumElts)
14825 return false;
14826 }
14827
14828 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14829 return true;
14830
14831 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14832 // or smaller than the implicit element type represented by N.
14833 // NOTE: A larger element count implies a smaller element type.
14834 if (N.getOpcode() == AArch64ISD::PTRUE &&
14835 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14836 return N.getValueType().getVectorMinNumElements() >= NumElts;
14837
14838 // If we're compiling for a specific vector-length, we can check if the
14839 // pattern's VL equals that of the scalable vector at runtime.
14840 if (N.getOpcode() == AArch64ISD::PTRUE) {
14841 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14842 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14843 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14844 if (MaxSVESize && MinSVESize == MaxSVESize) {
14845 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14846 unsigned PatNumElts =
14847 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14848 return PatNumElts == (NumElts * VScale);
14849 }
14850 }
14851
14852 return false;
14853}
14854
14855// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14856// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14857// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14858// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14859// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14860// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
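// For example, with v4i32 elements and C2 == 8:
//   (or (and X, <0x000000ff,...>), (AArch64ISD::VSHL Y, 8))
//     ==> (AArch64ISD::VSLI X, Y, 8)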
14861 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14862 EVT VT = N->getValueType(0);
14863
14864 if (!VT.isVector())
14865 return SDValue();
14866
14867 SDLoc DL(N);
14868
14869 SDValue And;
14870 SDValue Shift;
14871
14872 SDValue FirstOp = N->getOperand(0);
14873 unsigned FirstOpc = FirstOp.getOpcode();
14874 SDValue SecondOp = N->getOperand(1);
14875 unsigned SecondOpc = SecondOp.getOpcode();
14876
14877 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14878 // a BICi in order to use an immediate instead of a register.
14879 // Is the other operand an shl or lshr? This will have been turned into:
14880 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14881 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14882 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14883 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14884 SecondOpc == AArch64ISD::SHL_PRED ||
14885 SecondOpc == AArch64ISD::SRL_PRED)) {
14886 And = FirstOp;
14887 Shift = SecondOp;
14888
14889 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14890 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14891 FirstOpc == AArch64ISD::SHL_PRED ||
14892 FirstOpc == AArch64ISD::SRL_PRED)) {
14893 And = SecondOp;
14894 Shift = FirstOp;
14895 } else
14896 return SDValue();
14897
14898 bool IsAnd = And.getOpcode() == ISD::AND;
14899 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14900 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14901 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14902 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14903
14904 // Is the shift amount constant and are all lanes active?
14905 uint64_t C2;
14906 if (ShiftHasPredOp) {
14907 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
14908 return SDValue();
14909 APInt C;
14910 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
14911 return SDValue();
14912 C2 = C.getZExtValue();
14913 } else if (ConstantSDNode *C2node =
14914 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
14915 C2 = C2node->getZExtValue();
14916 else
14917 return SDValue();
14918
14919 APInt C1AsAPInt;
14920 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14921 if (IsAnd) {
14922 // Is the and mask vector all constant?
14923 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
14924 return SDValue();
14925 } else {
14926 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14927 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
14928 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
14929 assert(C1nodeImm && C1nodeShift);
14930 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14931 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
14932 }
14933
14934 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14935 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14936 // how much one can shift elements of a particular size?
14937 if (C2 > ElemSizeInBits)
14938 return SDValue();
14939
14940 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
14941 : APInt::getLowBitsSet(ElemSizeInBits, C2);
14942 if (C1AsAPInt != RequiredC1)
14943 return SDValue();
14944
14945 SDValue X = And.getOperand(0);
14946 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
14947 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
14948 : Shift.getOperand(1);
14949
14950 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14951 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
14952
14953 return ResultSLI;
14954}
14955
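// Attempt to lower an OR of two ANDs to an AArch64ISD::BSP (bitwise select):
//   (or (and a, b), (and (not a), c)) ==> (bsl a, b, c)
// including the form InstCombine produces where (not (neg a)) has become
// (add a, -1). BSP takes bits from its second operand where the first
// operand's bit is set and from the third operand otherwise.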
14956 static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
14957 EVT VT = N->getValueType(0);
14958 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
14959 SDLoc DL(N);
14960 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14961
14962 if (VT.isScalableVector() && !Subtarget.hasSVE2())
14963 return SDValue();
14964
14965 SDValue N0 = N->getOperand(0);
14966 if (N0.getOpcode() != ISD::AND)
14967 return SDValue();
14968
14969 SDValue N1 = N->getOperand(1);
14970 if (N1.getOpcode() != ISD::AND)
14971 return SDValue();
14972
14973 // InstCombine does (not (neg a)) => (add a -1).
14974 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
14975 // Loop over all combinations of AND operands.
14976 for (int i = 1; i >= 0; --i) {
14977 for (int j = 1; j >= 0; --j) {
14978 SDValue O0 = N0->getOperand(i);
14979 SDValue O1 = N1->getOperand(j);
14980 SDValue Sub, Add, SubSibling, AddSibling;
14981
14982 // Find a SUB and an ADD operand, one from each AND.
14983 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
14984 Sub = O0;
14985 Add = O1;
14986 SubSibling = N0->getOperand(1 - i);
14987 AddSibling = N1->getOperand(1 - j);
14988 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
14989 Add = O0;
14990 Sub = O1;
14991 AddSibling = N0->getOperand(1 - i);
14992 SubSibling = N1->getOperand(1 - j);
14993 } else
14994 continue;
14995
14996 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
14997 continue;
14998
14999 // The constant all-ones vector is always the right-hand operand of the Add.
15000 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15001 continue;
15002
15003 if (Sub.getOperand(1) != Add.getOperand(0))
15004 continue;
15005
15006 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15007 }
15008 }
15009
15010 // (or (and a b) (and (not a) c)) => (bsl a b c)
15011 // We only have to look for constant vectors here since the general, variable
15012 // case can be handled in TableGen.
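// E.g. for a v4i32 splat mask M and its bitwise complement ~M:
//   (or (and X, M), (and Y, ~M)) ==> (AArch64ISD::BSP M, X, Y)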
15013 unsigned Bits = VT.getScalarSizeInBits();
15014 for (int i = 1; i >= 0; --i)
15015 for (int j = 1; j >= 0; --j) {
15016 APInt Val1, Val2;
15017
15018 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15019 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15020 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15021 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15022 N0->getOperand(1 - i), N1->getOperand(1 - j));
15023 }
15024 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15025 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15026 if (!BVN0 || !BVN1)
15027 continue;
15028
15029 bool FoundMatch = true;
15030 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15031 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15032 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15033 if (!CN0 || !CN1 ||
15034 CN0->getAPIntValue().trunc(Bits) !=
15035 ~CN1->getAsAPIntVal().trunc(Bits)) {
15036 FoundMatch = false;
15037 break;
15038 }
15039 }
15040 if (FoundMatch)
15041 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15042 N0->getOperand(1 - i), N1->getOperand(1 - j));
15043 }
15044
15045 return SDValue();
15046}
15047
15048SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15049 SelectionDAG &DAG) const {
15050 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15051 !Subtarget->isNeonAvailable()))
15052 return LowerToScalableOp(Op, DAG);
15053
15054 if (SDValue Res = tryLowerToBSL(Op, DAG))
15055 return Res;
15056
15057 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15058 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15059 return Res;
15060
15061 EVT VT = Op.getValueType();
15062 if (VT.isScalableVector())
15063 return Op;
15064
15065 SDValue LHS = Op.getOperand(0);
15066 BuildVectorSDNode *BVN =
15067 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15068 if (!BVN) {
15069 // OR commutes, so try swapping the operands.
15070 LHS = Op.getOperand(1);
15071 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15072 }
15073 if (!BVN)
15074 return Op;
15075
15076 APInt DefBits(VT.getSizeInBits(), 0);
15077 APInt UndefBits(VT.getSizeInBits(), 0);
15078 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15079 SDValue NewOp;
15080
15081 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15082 DefBits, &LHS)) ||
15083 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15084 DefBits, &LHS)))
15085 return NewOp;
15086
15087 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15088 UndefBits, &LHS)) ||
15089 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15090 UndefBits, &LHS)))
15091 return NewOp;
15092 }
15093
15094 // We can always fall back to a non-immediate OR.
15095 return Op;
15096}
15097
15098// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15099// be truncated to fit element width.
15100 static SDValue NormalizeBuildVector(SDValue Op,
15101 SelectionDAG &DAG) {
15102 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15103 SDLoc DL(Op);
15104 EVT VT = Op.getValueType();
15105 EVT EltTy = VT.getVectorElementType();
15106
15107 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15108 return Op;
15109
15110 SmallVector<SDValue, 16> Ops;
15111 for (SDValue Lane : Op->ops()) {
15112 // For integer vectors, type legalization would have promoted the
15113 // operands already. Otherwise, if Op is a floating-point splat
15114 // (with operands cast to integers), then the only possibilities
15115 // are constants and UNDEFs.
15116 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15117 Lane = DAG.getConstant(
15118 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15119 DL, MVT::i32);
15120 } else if (Lane.getNode()->isUndef()) {
15121 Lane = DAG.getUNDEF(MVT::i32);
15122 } else {
15123 assert(Lane.getValueType() == MVT::i32 &&
15124 "Unexpected BUILD_VECTOR operand type");
15125 }
15126 Ops.push_back(Lane);
15127 }
15128 return DAG.getBuildVector(VT, DL, Ops);
15129}
15130
15131 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15132 const AArch64Subtarget *ST) {
15133 EVT VT = Op.getValueType();
15134 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15135 "Expected a legal NEON vector");
15136
15137 APInt DefBits(VT.getSizeInBits(), 0);
15138 APInt UndefBits(VT.getSizeInBits(), 0);
15139 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15140 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15141 auto TryMOVIWithBits = [&](APInt DefBits) {
15142 SDValue NewOp;
15143 if ((NewOp =
15144 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15145 (NewOp =
15146 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15147 (NewOp =
15148 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15149 (NewOp =
15150 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15151 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15152 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15153 return NewOp;
15154
15155 APInt NotDefBits = ~DefBits;
15156 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15157 NotDefBits)) ||
15158 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15159 NotDefBits)) ||
15160 (NewOp =
15161 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15162 return NewOp;
15163 return SDValue();
15164 };
15165 if (SDValue R = TryMOVIWithBits(DefBits))
15166 return R;
15167 if (SDValue R = TryMOVIWithBits(UndefBits))
15168 return R;
15169
15170 // See if a fneg of the constant can be materialized with a MOVI, etc
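// E.g. a v2f64 splat of -0.0 has no modified-immediate encoding of its own,
// but flipping the sign bits gives +0.0, which MOVI can produce, so it can be
// emitted as (fneg (MOVI #0)).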
15171 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15172 // FNegate each sub-element of the constant
15173 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15174 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15175 .zext(VT.getSizeInBits());
15176 APInt NegBits(VT.getSizeInBits(), 0);
15177 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15178 for (unsigned i = 0; i < NumElts; i++)
15179 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15180 NegBits = DefBits ^ NegBits;
15181
15182 // Try to create the new constants with MOVI, and if so generate a fneg
15183 // for it.
15184 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15185 SDLoc DL(Op);
15186 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15187 return DAG.getNode(
15188 AArch64ISD::NVCAST, DL, VT,
15189 DAG.getNode(ISD::FNEG, DL, VFVT,
15190 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15191 }
15192 return SDValue();
15193 };
15194 SDValue R;
15195 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15196 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15197 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15198 return R;
15199 }
15200
15201 return SDValue();
15202}
15203
15204SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15205 SDValue Op, SelectionDAG &DAG) const {
15206 EVT VT = Op.getValueType();
15207 SDLoc DL(Op);
15208 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15209 auto *BVN = cast<BuildVectorSDNode>(Op);
15210
15211 if (auto SeqInfo = BVN->isConstantSequence()) {
15212 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15213 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15214 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15215 return convertFromScalableVector(DAG, VT, Seq);
15216 }
15217
15218 unsigned NumElems = VT.getVectorNumElements();
15219 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15220 NumElems <= 1 || BVN->isConstant())
15221 return SDValue();
15222
15223 auto IsExtractElt = [](SDValue Op) {
15224 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15225 };
15226
15227 // For integer types that are not already in vectors limit to at most four
15228 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15229 if (VT.getScalarType().isInteger() &&
15230 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15231 return SDValue();
15232
15233 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
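// Each scalar is first inserted into lane 0 of an undef SVE container; pairs
// of containers are then interleaved with ZIP1 until a single vector remains,
// e.g. a 4-element build_vector (a, b, c, d) becomes
//   zip1(zip1(a, b), zip1(c, d))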
15234 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15235 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15236 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15237 return Op.isUndef() ? Undef
15238 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15239 ContainerVT, Undef, Op, ZeroI64);
15240 });
15241
15242 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15243 while (Intermediates.size() > 1) {
15244 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15245
15246 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15247 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15248 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15249 Intermediates[I / 2] =
15250 Op1.isUndef() ? Op0
15251 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15252 }
15253
15254 Intermediates.resize(Intermediates.size() / 2);
15255 ZipEC = ZipEC.divideCoefficientBy(2);
15256 }
15257
15258 assert(Intermediates.size() == 1);
15259 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15260 return convertFromScalableVector(DAG, VT, Vec);
15261}
15262
15263SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15264 SelectionDAG &DAG) const {
15265 EVT VT = Op.getValueType();
15266
15267 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15268 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15269 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15270 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15271
15272 // Try to build a simple constant vector.
15273 Op = NormalizeBuildVector(Op, DAG);
15274 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
15275 // abort.
15276 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15277 return SDValue();
15278
15279 // Certain vector constants, used to express things like logical NOT and
15280 // arithmetic NEG, are passed through unmodified. This allows special
15281 // patterns for these operations to match, which will lower these constants
15282 // to whatever is proven necessary.
15283 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15284 if (BVN->isConstant()) {
15285 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15286 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15287 APInt Val(BitSize,
15288 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15289 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15290 return Op;
15291 }
15292 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15293 if (Const->isZero() && !Const->isNegative())
15294 return Op;
15295 }
15296
15297 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15298 return V;
15299
15300 // Scan through the operands to find some interesting properties we can
15301 // exploit:
15302 // 1) If only one value is used, we can use a DUP, or
15303 // 2) if only the low element is not undef, we can just insert that, or
15304 // 3) if only one constant value is used (w/ some non-constant lanes),
15305 // we can splat the constant value into the whole vector then fill
15306 // in the non-constant lanes.
15307 // 4) FIXME: If different constant values are used, but we can intelligently
15308 // select the values we'll be overwriting for the non-constant
15309 // lanes such that we can directly materialize the vector
15310 // some other way (MOVI, e.g.), we can be sneaky.
15311 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15312 SDLoc DL(Op);
15313 unsigned NumElts = VT.getVectorNumElements();
15314 bool isOnlyLowElement = true;
15315 bool usesOnlyOneValue = true;
15316 bool usesOnlyOneConstantValue = true;
15317 bool isConstant = true;
15318 bool AllLanesExtractElt = true;
15319 unsigned NumConstantLanes = 0;
15320 unsigned NumDifferentLanes = 0;
15321 unsigned NumUndefLanes = 0;
15322 SDValue Value;
15323 SDValue ConstantValue;
15324 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15325 unsigned ConsecutiveValCount = 0;
15326 SDValue PrevVal;
15327 for (unsigned i = 0; i < NumElts; ++i) {
15328 SDValue V = Op.getOperand(i);
15329 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15330 AllLanesExtractElt = false;
15331 if (V.isUndef()) {
15332 ++NumUndefLanes;
15333 continue;
15334 }
15335 if (i > 0)
15336 isOnlyLowElement = false;
15337 if (!isIntOrFPConstant(V))
15338 isConstant = false;
15339
15340 if (isIntOrFPConstant(V)) {
15341 ++NumConstantLanes;
15342 if (!ConstantValue.getNode())
15343 ConstantValue = V;
15344 else if (ConstantValue != V)
15345 usesOnlyOneConstantValue = false;
15346 }
15347
15348 if (!Value.getNode())
15349 Value = V;
15350 else if (V != Value) {
15351 usesOnlyOneValue = false;
15352 ++NumDifferentLanes;
15353 }
15354
15355 if (PrevVal != V) {
15356 ConsecutiveValCount = 0;
15357 PrevVal = V;
15358 }
15359
15360 // Keep each distinct value and its last consecutive count. For example,
15361 //
15362 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15363 // t24, t24, t24, t24, t24, t24, t24, t24
15364 // t23 = consecutive count 8
15365 // t24 = consecutive count 8
15366 // ------------------------------------------------------------------
15367 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15368 // t24, t24, t24, t24, t24, t24, t24, t24
15369 // t23 = consecutive count 5
15370 // t24 = consecutive count 9
15371 DifferentValueMap[V] = ++ConsecutiveValCount;
15372 }
15373
15374 if (!Value.getNode()) {
15375 LLVM_DEBUG(
15376 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15377 return DAG.getUNDEF(VT);
15378 }
15379
15380 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15381 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15382 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15383 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15384 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15385 "SCALAR_TO_VECTOR node\n");
15386 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15387 }
15388
15389 if (AllLanesExtractElt) {
15390 SDNode *Vector = nullptr;
15391 bool Even = false;
15392 bool Odd = false;
15393 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15394 // the Odd pattern <1,3,5,...>.
15395 for (unsigned i = 0; i < NumElts; ++i) {
15396 SDValue V = Op.getOperand(i);
15397 const SDNode *N = V.getNode();
15398 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15399 Even = false;
15400 Odd = false;
15401 break;
15402 }
15403 SDValue N0 = N->getOperand(0);
15404
15405 // All elements are extracted from the same vector.
15406 if (!Vector) {
15407 Vector = N0.getNode();
15408 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15409 // BUILD_VECTOR.
15410 if (VT.getVectorElementType() !=
15411 N0.getValueType().getVectorElementType())
15412 break;
15413 } else if (Vector != N0.getNode()) {
15414 Odd = false;
15415 Even = false;
15416 break;
15417 }
15418
15419 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15420 // indices <1,3,5,...>.
15421 uint64_t Val = N->getConstantOperandVal(1);
15422 if (Val == 2 * i) {
15423 Even = true;
15424 continue;
15425 }
15426 if (Val - 1 == 2 * i) {
15427 Odd = true;
15428 continue;
15429 }
15430
15431 // Something does not match: abort.
15432 Odd = false;
15433 Even = false;
15434 break;
15435 }
15436 if (Even || Odd) {
15437 SDValue LHS =
15438 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15439 DAG.getConstant(0, DL, MVT::i64));
15440 SDValue RHS =
15441 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15442 DAG.getConstant(NumElts, DL, MVT::i64));
15443
15444 if (Even && !Odd)
15445 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15446 if (Odd && !Even)
15447 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15448 }
15449 }
15450
15451 // Use DUP for non-constant splats. For f32 constant splats, reduce to
15452 // i32 and try again.
15453 if (usesOnlyOneValue) {
15454 if (!isConstant) {
15455 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15456 Value.getValueType() != VT) {
15457 LLVM_DEBUG(
15458 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15459 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15460 }
15461
15462 // This is actually a DUPLANExx operation, which keeps everything vectory.
15463
15464 SDValue Lane = Value.getOperand(1);
15465 Value = Value.getOperand(0);
15466 if (Value.getValueSizeInBits() == 64) {
15467 LLVM_DEBUG(
15468 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15469 "widening it\n");
15470 Value = WidenVector(Value, DAG);
15471 }
15472
15473 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15474 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15475 }
15476
15477 if (VT.getVectorElementType().isFloatingPoint()) {
15478 SmallVector<SDValue, 8> Ops;
15479 EVT EltTy = VT.getVectorElementType();
15480 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15481 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15482 LLVM_DEBUG(
15483 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15484 "BITCASTS, and try again\n");
15485 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15486 for (unsigned i = 0; i < NumElts; ++i)
15487 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15488 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15489 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15490 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15491 Val.dump(););
15492 Val = LowerBUILD_VECTOR(Val, DAG);
15493 if (Val.getNode())
15494 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15495 }
15496 }
15497
15498 // If we need to insert a small number of different non-constant elements and
15499 // the vector width is sufficiently large, prefer using DUP with the common
15500 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15501 // skip the constant lane handling below.
15502 bool PreferDUPAndInsert =
15503 !isConstant && NumDifferentLanes >= 1 &&
15504 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15505 NumDifferentLanes >= NumConstantLanes;
15506
15507 // If only one constant value was used, and it was used for more than one lane,
15508 // start by splatting that value, then replace the non-constant lanes. This
15509 // is better than the default, which will perform a separate initialization
15510 // for each lane.
15511 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15512 // Firstly, try to materialize the splat constant.
15513 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15514 unsigned BitSize = VT.getScalarSizeInBits();
15515 APInt ConstantValueAPInt(1, 0);
15516 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15517 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15518 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15519 !ConstantValueAPInt.isAllOnes()) {
15520 Val = ConstantBuildVector(Val, DAG, Subtarget);
15521 if (!Val)
15522 // Otherwise, materialize the constant and splat it.
15523 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15524 }
15525
15526 // Now insert the non-constant lanes.
15527 for (unsigned i = 0; i < NumElts; ++i) {
15528 SDValue V = Op.getOperand(i);
15529 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15530 if (!isIntOrFPConstant(V))
15531 // Note that type legalization likely mucked about with the VT of the
15532 // source operand, so we may have to convert it here before inserting.
15533 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15534 }
15535 return Val;
15536 }
15537
15538 // This will generate a load from the constant pool.
15539 if (isConstant) {
15540 LLVM_DEBUG(
15541 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15542 "expansion\n");
15543 return SDValue();
15544 }
15545
15546 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15547 // v4i32s. This is really a truncate, which we can construct out of (legal)
15548 // concats and truncate nodes.
15549 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15550 return M;
15551
15552 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15553 if (NumElts >= 4) {
15554 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15555 return Shuffle;
15556
15557 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15558 return Shuffle;
15559 }
15560
15561 if (PreferDUPAndInsert) {
15562 // First, build a constant vector with the common element.
15563 SmallVector<SDValue, 8> Ops(NumElts, Value);
15564 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15565 // Next, insert the elements that do not match the common value.
15566 for (unsigned I = 0; I < NumElts; ++I)
15567 if (Op.getOperand(I) != Value)
15568 NewVector =
15569 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15570 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15571
15572 return NewVector;
15573 }
15574
15575 // If vector consists of two different values, try to generate two DUPs and
15576 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15577 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15578 SmallVector<SDValue, 2> Vals;
15579 // Check that each value's consecutive count is half the number of vector
15580 // elements. In this case, we can use CONCAT_VECTORS. For example,
15581 //
15582 // canUseVECTOR_CONCAT = true;
15583 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15584 // t24, t24, t24, t24, t24, t24, t24, t24
15585 //
15586 // canUseVECTOR_CONCAT = false;
15587 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15588 // t24, t24, t24, t24, t24, t24, t24, t24
15589 bool canUseVECTOR_CONCAT = true;
15590 for (auto Pair : DifferentValueMap) {
15591 // Check that each value's consecutive count is NumElts / 2.
15592 if (Pair.second != NumElts / 2)
15593 canUseVECTOR_CONCAT = false;
15594 Vals.push_back(Pair.first);
15595 }
15596
15597 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15598 // CONCAT_VECTORs. For example,
15599 //
15600 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15601 // t24, t24, t24, t24, t24, t24, t24, t24
15602 // ==>
15603 // t26: v8i8 = AArch64ISD::DUP t23
15604 // t28: v8i8 = AArch64ISD::DUP t24
15605 // t29: v16i8 = concat_vectors t26, t28
15606 if (canUseVECTOR_CONCAT) {
15607 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15608 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15609 SubVT.getVectorNumElements() >= 2) {
15610 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15611 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15612 SDValue DUP1 =
15613 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15614 SDValue DUP2 =
15615 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15616 SDValue CONCAT_VECTORS =
15617 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15618 return CONCAT_VECTORS;
15619 }
15620 }
15621
15622 // Let's try to generate VECTOR_SHUFFLE. For example,
15623 //
15624 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15625 // ==>
15626 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15627 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15628 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15629 if (NumElts >= 8) {
15630 SmallVector<int, 16> MaskVec;
15631 // Build the mask for VECTOR_SHUFFLE.
15632 SDValue FirstLaneVal = Op.getOperand(0);
15633 for (unsigned i = 0; i < NumElts; ++i) {
15634 SDValue Val = Op.getOperand(i);
15635 if (FirstLaneVal == Val)
15636 MaskVec.push_back(i);
15637 else
15638 MaskVec.push_back(i + NumElts);
15639 }
15640
15641 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15642 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15643 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15644 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15645 SDValue VECTOR_SHUFFLE =
15646 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15647 return VECTOR_SHUFFLE;
15648 }
15649 }
15650
15651 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15652 // know the default expansion would otherwise fall back on something even
15653 // worse. For a vector with one or two non-undef values, that's
15654 // scalar_to_vector for the elements followed by a shuffle (provided the
15655 // shuffle is valid for the target) and materialization element by element
15656 // on the stack followed by a load for everything else.
15657 if (!isConstant && !usesOnlyOneValue) {
15658 LLVM_DEBUG(
15659 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15660 "of INSERT_VECTOR_ELT\n");
15661
15662 SDValue Vec = DAG.getUNDEF(VT);
15663 SDValue Op0 = Op.getOperand(0);
15664 unsigned i = 0;
15665
15666 // Use SCALAR_TO_VECTOR for lane zero to
15667 // a) Avoid a RMW dependency on the full vector register, and
15668 // b) Allow the register coalescer to fold away the copy if the
15669 // value is already in an S or D register, and we're forced to emit an
15670 // INSERT_SUBREG that we can't fold anywhere.
15671 //
15672 // We also allow types like i8 and i16 which are illegal scalar but legal
15673 // vector element types. After type-legalization the inserted value is
15674 // extended (i32) and it is safe to cast them to the vector type by ignoring
15675 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15676 if (!Op0.isUndef()) {
15677 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15678 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15679 ++i;
15680 }
15681 LLVM_DEBUG({
15682 if (i < NumElts)
15683 dbgs() << "Creating nodes for the other vector elements:\n";
15684 });
15685 for (; i < NumElts; ++i) {
15686 SDValue V = Op.getOperand(i);
15687 if (V.isUndef())
15688 continue;
15689 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15690 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15691 }
15692 return Vec;
15693 }
15694
15695 LLVM_DEBUG(
15696 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15697 "better alternative\n");
15698 return SDValue();
15699}
15700
15701SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15702 SelectionDAG &DAG) const {
15703 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15704 !Subtarget->isNeonAvailable()))
15705 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15706
15707 assert(Op.getValueType().isScalableVector() &&
15708 isTypeLegal(Op.getValueType()) &&
15709 "Expected legal scalable vector type!");
15710
15711 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15712 unsigned NumOperands = Op->getNumOperands();
15713 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15714 "Unexpected number of operands in CONCAT_VECTORS");
15715
15716 if (NumOperands == 2)
15717 return Op;
15718
15719 // Concat each pair of subvectors and pack into the lower half of the array.
15720 SmallVector<SDValue> ConcatOps(Op->ops());
15721 while (ConcatOps.size() > 1) {
15722 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15723 SDValue V1 = ConcatOps[I];
15724 SDValue V2 = ConcatOps[I + 1];
15725 EVT SubVT = V1.getValueType();
15726 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15727 ConcatOps[I / 2] =
15728 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15729 }
15730 ConcatOps.resize(ConcatOps.size() / 2);
15731 }
15732 return ConcatOps[0];
15733 }
15734
15735 return SDValue();
15736}
15737
15738SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15739 SelectionDAG &DAG) const {
15740 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15741
15742 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15743 !Subtarget->isNeonAvailable()))
15744 return LowerFixedLengthInsertVectorElt(Op, DAG);
15745
15746 EVT VT = Op.getOperand(0).getValueType();
15747
15748 if (VT.getScalarType() == MVT::i1) {
15749 EVT VectorVT = getPromotedVTForPredicate(VT);
15750 SDLoc DL(Op);
15751 SDValue ExtendedVector =
15752 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15753 SDValue ExtendedValue =
15754 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15755 VectorVT.getScalarType().getSizeInBits() < 32
15756 ? MVT::i32
15757 : VectorVT.getScalarType());
15758 ExtendedVector =
15759 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15760 ExtendedValue, Op.getOperand(2));
15761 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15762 }
15763
15764 // Check for non-constant or out of range lane.
15765 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15766 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15767 return SDValue();
15768
15769 return Op;
15770}
15771
15772SDValue
15773AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15774 SelectionDAG &DAG) const {
15775 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15776 EVT VT = Op.getOperand(0).getValueType();
15777
15778 if (VT.getScalarType() == MVT::i1) {
15779 // We can't directly extract from an SVE predicate; extend it first.
15780 // (This isn't the only possible lowering, but it's straightforward.)
15781 EVT VectorVT = getPromotedVTForPredicate(VT);
15782 SDLoc DL(Op);
15783 SDValue Extend =
15784 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15785 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15786 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15787 Extend, Op.getOperand(1));
15788 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15789 }
15790
15791 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15792 return LowerFixedLengthExtractVectorElt(Op, DAG);
15793
15794 // Check for non-constant or out of range lane.
15795 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15796 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15797 return SDValue();
15798
15799 // Insertion/extraction are legal for V128 types.
15800 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15801 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15802 VT == MVT::v8f16 || VT == MVT::v8bf16)
15803 return Op;
15804
15805 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15806 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15807 VT != MVT::v4bf16)
15808 return SDValue();
15809
15810 // For V64 types, we perform extraction by expanding the value
15811 // to a V128 type and perform the extraction on that.
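// E.g. an extract from v4i16 is widened to v8i16, and i8/i16 elements are
// returned as i32.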
15812 SDLoc DL(Op);
15813 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15814 EVT WideTy = WideVec.getValueType();
15815
15816 EVT ExtrTy = WideTy.getVectorElementType();
15817 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15818 ExtrTy = MVT::i32;
15819
15820 // For extractions, we just return the result directly.
15821 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15822 Op.getOperand(1));
15823}
15824
15825SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15826 SelectionDAG &DAG) const {
15827 EVT VT = Op.getValueType();
15828 assert(VT.isFixedLengthVector() &&
15829 "Only cases that extract a fixed length vector are supported!");
15830 EVT InVT = Op.getOperand(0).getValueType();
15831
15832 // If we don't have legal types yet, do nothing
15833 if (!isTypeLegal(InVT))
15834 return SDValue();
15835
15836 if (InVT.is128BitVector()) {
15837 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15838 unsigned Idx = Op.getConstantOperandVal(1);
15839
15840 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15841 if (Idx == 0)
15842 return Op;
15843
15844 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15845 // that directly.
15846 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15847 return Op;
15848 }
15849
15850 if (InVT.isScalableVector() ||
15851 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
15852 SDLoc DL(Op);
15853 SDValue Vec = Op.getOperand(0);
15854 SDValue Idx = Op.getOperand(1);
15855
15856 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
15857 if (PackedVT != InVT) {
15858 // Pack input into the bottom part of an SVE register and try again.
15859 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
15860 DAG.getUNDEF(PackedVT), Vec,
15861 DAG.getVectorIdxConstant(0, DL));
15862 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
15863 }
15864
15865 // This will get matched by custom code during ISelDAGToDAG.
15866 if (isNullConstant(Idx))
15867 return Op;
15868
15869 assert(InVT.isScalableVector() && "Unexpected vector type!");
15870 // Move requested subvector to the start of the vector and try again.
15871 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
15872 return convertFromScalableVector(DAG, VT, Splice);
15873 }
15874
15875 return SDValue();
15876}
15877
15878SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15879 SelectionDAG &DAG) const {
15880 assert(Op.getValueType().isScalableVector() &&
15881 "Only expect to lower inserts into scalable vectors!");
15882
15883 EVT InVT = Op.getOperand(1).getValueType();
15884 unsigned Idx = Op.getConstantOperandVal(2);
15885
15886 SDValue Vec0 = Op.getOperand(0);
15887 SDValue Vec1 = Op.getOperand(1);
15888 SDLoc DL(Op);
15889 EVT VT = Op.getValueType();
15890
15891 if (InVT.isScalableVector()) {
15892 if (!isTypeLegal(VT))
15893 return SDValue();
15894
15895 // Break down insert_subvector into simpler parts.
15896 if (VT.getVectorElementType() == MVT::i1) {
15897 unsigned NumElts = VT.getVectorMinNumElements();
15898 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15899
15900 SDValue Lo, Hi;
15901 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15902 DAG.getVectorIdxConstant(0, DL));
15903 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15904 DAG.getVectorIdxConstant(NumElts / 2, DL));
15905 if (Idx < (NumElts / 2))
15906 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
15907 DAG.getVectorIdxConstant(Idx, DL));
15908 else
15909 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
15910 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
15911
15912 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15913 }
15914
15915 // We can select these directly.
15916 if (isTypeLegal(InVT) && Vec0.isUndef())
15917 return Op;
15918
15919 // Ensure the subvector is half the size of the main vector.
15920 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15921 return SDValue();
15922
15923 // Here narrow and wide refer to the vector element types. After "casting"
15924 // both vectors must have the same bit length and so because the subvector
15925 // has fewer elements, those elements need to be bigger.
15926 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
15927 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
15928
15929 // NOP cast operands to the largest legal vector of the same element count.
15930 if (VT.isFloatingPoint()) {
15931 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
15932 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
15933 } else {
15934 // Legal integer vectors are already their largest so Vec0 is fine as is.
15935 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
15936 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
15937 }
15938
15939 // To replace the top/bottom half of vector V with vector SubV we widen the
15940 // preserved half of V, concatenate this to SubV (the order depending on the
15941 // half being replaced) and then narrow the result.
15942 SDValue Narrow;
15943 if (Idx == 0) {
15944 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
15945 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
15946 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
15947 } else {
15948 assert(Idx == InVT.getVectorMinNumElements() &&
15949 "Invalid subvector index!");
15950 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
15951 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
15952 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
15953 }
15954
15955 return getSVESafeBitCast(VT, Narrow, DAG);
15956 }
15957
15958 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15959 // This will be matched by custom code during ISelDAGToDAG.
15960 if (Vec0.isUndef())
15961 return Op;
15962
15963 std::optional<unsigned> PredPattern =
15964 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
15965 auto PredTy = VT.changeVectorElementType(MVT::i1);
15966 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
15967 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
15968 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
15969 }
15970
15971 return SDValue();
15972}
15973
15974static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15975 if (Op.getOpcode() != AArch64ISD::DUP &&
15976 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15977 Op.getOpcode() != ISD::BUILD_VECTOR)
15978 return false;
15979
15980 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15981 !isAllConstantBuildVector(Op, SplatVal))
15982 return false;
15983
15984 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15985 !isa<ConstantSDNode>(Op->getOperand(0)))
15986 return false;
15987
15988 SplatVal = Op->getConstantOperandVal(0);
15989 if (Op.getValueType().getVectorElementType() != MVT::i64)
15990 SplatVal = (int32_t)SplatVal;
15991
15992 Negated = false;
15993 if (isPowerOf2_64(SplatVal))
15994 return true;
15995
15996 Negated = true;
15997 if (isPowerOf2_64(-SplatVal)) {
15998 SplatVal = -SplatVal;
15999 return true;
16000 }
16001
16002 return false;
16003}
16004
16005SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16006 EVT VT = Op.getValueType();
16007 SDLoc DL(Op);
16008
16009 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16010 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16011
16012 assert(VT.isScalableVector() && "Expected a scalable vector.");
16013
16014 bool Signed = Op.getOpcode() == ISD::SDIV;
16015 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16016
16017 bool Negated;
16018 uint64_t SplatVal;
16019 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
16020 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
16021 SDValue Res =
16022 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16023 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16024 if (Negated)
16025 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16026
16027 return Res;
16028 }
16029
16030 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16031 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16032
16033 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16034 // operations, and truncate the result.
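// E.g. an nxv16i8 divide is unpacked into two nxv8i16 halves with
// [SU]UNPKLO/HI, each half is divided (possibly widening again to nxv4i32),
// and the results are narrowed back together with UZP1.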
16035 EVT WidenedVT;
16036 if (VT == MVT::nxv16i8)
16037 WidenedVT = MVT::nxv8i16;
16038 else if (VT == MVT::nxv8i16)
16039 WidenedVT = MVT::nxv4i32;
16040 else
16041 llvm_unreachable("Unexpected Custom DIV operation");
16042
16043 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16044 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16045 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16046 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16047 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16048 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16049 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16050 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16051 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16052 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16053 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16054}
16055
16056bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16057 EVT VT, unsigned DefinedValues) const {
16058 if (!Subtarget->isNeonAvailable())
16059 return false;
16060 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
16061}
16062
16063 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16064 // Currently no fixed length shuffles that require SVE are legal.
16065 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16066 return false;
16067
16068 if (VT.getVectorNumElements() == 4 &&
16069 (VT.is128BitVector() || VT.is64BitVector())) {
16070 unsigned Cost = getPerfectShuffleCost(M);
16071 if (Cost <= 1)
16072 return true;
16073 }
16074
16075 bool DummyBool;
16076 int DummyInt;
16077 unsigned DummyUnsigned;
16078
16079 unsigned EltSize = VT.getScalarSizeInBits();
16080 unsigned NumElts = VT.getVectorNumElements();
16081 return (ShuffleVectorSDNode::isSplatMask(M) ||
16082 isREVMask(M, EltSize, NumElts, 64) ||
16083 isREVMask(M, EltSize, NumElts, 32) ||
16084 isREVMask(M, EltSize, NumElts, 16) ||
16085 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16086 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16087 isTRNMask(M, NumElts, DummyUnsigned) ||
16088 isUZPMask(M, NumElts, DummyUnsigned) ||
16089 isZIPMask(M, NumElts, DummyUnsigned) ||
16090 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16091 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16092 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16093 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16094 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16095}
16096
16097 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
16098 EVT VT) const {
16099 // Just delegate to the generic legality, clear masks aren't special.
16100 return isShuffleMaskLegal(M, VT);
16101}
16102
16103/// getVShiftImm - Check if this is a valid build_vector for the immediate
16104/// operand of a vector shift operation, where all the elements of the
16105/// build_vector must have the same constant integer value.
16106static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16107 // Ignore bit_converts.
16108 while (Op.getOpcode() == ISD::BITCAST)
16109 Op = Op.getOperand(0);
16110 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
16111 APInt SplatBits, SplatUndef;
16112 unsigned SplatBitSize;
16113 bool HasAnyUndefs;
16114 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16115 HasAnyUndefs, ElementBits) ||
16116 SplatBitSize > ElementBits)
16117 return false;
16118 Cnt = SplatBits.getSExtValue();
16119 return true;
16120}
16121
16122/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16123/// operand of a vector shift left operation. That value must be in the range:
16124/// 0 <= Value < ElementBits for a left shift; or
16125/// 0 <= Value <= ElementBits for a long left shift.
16126static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16127 assert(VT.isVector() && "vector shift count is not a vector type");
16128 int64_t ElementBits = VT.getScalarSizeInBits();
16129 if (!getVShiftImm(Op, ElementBits, Cnt))
16130 return false;
16131 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16132}
16133
16134/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16135/// operand of a vector shift right operation. The value must be in the range:
16136 /// 1 <= Value <= ElementBits for a right shift, or 1 <= Value <= ElementBits/2 for a narrow right shift.
16137static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16138 assert(VT.isVector() && "vector shift count is not a vector type");
16139 int64_t ElementBits = VT.getScalarSizeInBits();
16140 if (!getVShiftImm(Op, ElementBits, Cnt))
16141 return false;
16142 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16143}
16144
16145SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16146 SelectionDAG &DAG) const {
16147 EVT VT = Op.getValueType();
16148
16149 if (VT.getScalarType() == MVT::i1) {
16150 // Lower i1 truncate to `(x & 1) != 0`.
16151 SDLoc DL(Op);
16152 EVT OpVT = Op.getOperand(0).getValueType();
16153 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16154 SDValue One = DAG.getConstant(1, DL, OpVT);
16155 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16156 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16157 }
16158
16159 if (!VT.isVector() || VT.isScalableVector())
16160 return SDValue();
16161
16162 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16163 !Subtarget->isNeonAvailable()))
16164 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16165
16166 return SDValue();
16167}
16168
16169 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16170// possibly a truncated type, it tells how many bits of the value are to be
16171// used.
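// The pattern recognised is (srl (add X, 1 << (ShiftValue - 1)), ShiftValue),
// i.e. an add of half the rounding increment followed by the shift, which
// matches the rounding behaviour of the URSHR-style instructions.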
16172 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
16173 SelectionDAG &DAG,
16174 unsigned &ShiftValue,
16175 SDValue &RShOperand) {
16176 if (Shift->getOpcode() != ISD::SRL)
16177 return false;
16178
16179 EVT VT = Shift.getValueType();
16180 assert(VT.isScalableVT());
16181
16182 auto ShiftOp1 =
16183 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16184 if (!ShiftOp1)
16185 return false;
16186
16187 ShiftValue = ShiftOp1->getZExtValue();
16188 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16189 return false;
16190
16191 SDValue Add = Shift->getOperand(0);
16192 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16193 return false;
16194
16194
16195 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
16196 "ResVT must be truncated or same type as the shift.");
16197 // Check if an overflow can lead to incorrect results.
16198 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16199 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16200 return false;
16201
16202 auto AddOp1 =
16203 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16204 if (!AddOp1)
16205 return false;
16206 uint64_t AddValue = AddOp1->getZExtValue();
16207 if (AddValue != 1ULL << (ShiftValue - 1))
16208 return false;
16209
16210 RShOperand = Add->getOperand(0);
16211 return true;
16212}
16213
16214SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16215 SelectionDAG &DAG) const {
16216 EVT VT = Op.getValueType();
16217 SDLoc DL(Op);
16218 int64_t Cnt;
16219
16220 if (!Op.getOperand(1).getValueType().isVector())
16221 return Op;
16222 unsigned EltSize = VT.getScalarSizeInBits();
16223
16224 switch (Op.getOpcode()) {
16225 case ISD::SHL:
16226 if (VT.isScalableVector() ||
16227 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16228 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16229
16230 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16231 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16232 DAG.getConstant(Cnt, DL, MVT::i32));
16233 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16234 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
16235 MVT::i32),
16236 Op.getOperand(0), Op.getOperand(1));
16237 case ISD::SRA:
16238 case ISD::SRL:
16239 if (VT.isScalableVector() &&
16240 (Subtarget->hasSVE2() ||
16241 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16242 SDValue RShOperand;
16243 unsigned ShiftValue;
16244 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16245 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16246 getPredicateForVector(DAG, DL, VT), RShOperand,
16247 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16248 }
16249
16250 if (VT.isScalableVector() ||
16251 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16252 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16253 : AArch64ISD::SRL_PRED;
16254 return LowerToPredicatedOp(Op, DAG, Opc);
16255 }
16256
16257 // Right shift immediate
16258 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16259 unsigned Opc =
16260 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16261 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16262 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
16263 }
16264
16265 // Right shift register. Note, there is no shift right register
16266 // instruction, but the shift left register instruction takes a signed
16267 // value, where negative numbers specify a right shift.
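 // For example, "srl x, y" is emitted as "ushl x, (sub 0, y)" and
 // "sra x, y" as "sshl x, (sub 0, y)".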
16268 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16269 : Intrinsic::aarch64_neon_ushl;
16270 // negate the shift amount
16271 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16272 Op.getOperand(1));
16273 SDValue NegShiftLeft =
16274 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16275 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16276 NegShift);
16277 return NegShiftLeft;
16278 }
16279
16280 llvm_unreachable("unexpected shift opcode");
16281}
16282
16283SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16284 SelectionDAG &DAG) const {
16285 if (Op.getValueType().isScalableVector())
16286 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16287
16288 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16289 !Subtarget->isNeonAvailable()))
16290 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16291
16292 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16293 SDValue LHS = Op.getOperand(0);
16294 SDValue RHS = Op.getOperand(1);
16295 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16296 SDLoc DL(Op);
16297
16298 if (LHS.getValueType().getVectorElementType().isInteger())
16299 return Op;
16300
16301 assert(((!Subtarget->hasFullFP16() &&
16302 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16303 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16304 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16305 "Unexpected type!");
16306
16307 // Lower isnan(x) | isnan(never-nan) to x != x.
16308 // Lower !isnan(x) & !isnan(never-nan) to x == x.
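 // For example, setcc(x, x, setuo) is equivalent to "x != x" and is rewritten
 // to setcc(x, x, setune); setcc(x, x, seto) becomes setcc(x, x, setoeq).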
16309 if (CC == ISD::SETUO || CC == ISD::SETO) {
16310 bool OneNaN = false;
16311 if (LHS == RHS) {
16312 OneNaN = true;
16313 } else if (DAG.isKnownNeverNaN(RHS)) {
16314 OneNaN = true;
16315 RHS = LHS;
16316 } else if (DAG.isKnownNeverNaN(LHS)) {
16317 OneNaN = true;
16318 LHS = RHS;
16319 }
16320 if (OneNaN) {
16321 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16322 }
16323 }
16324
16325 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16326 // clean. Some of them require two branches to implement.
16327 AArch64CC::CondCode CC1, CC2;
16328 bool ShouldInvert;
16329 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16330
16331 bool NoNaNs =
16332 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16333 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16334 if (!Cmp.getNode())
16335 return SDValue();
16336
16337 if (CC2 != AArch64CC::AL) {
16338 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16339 if (!Cmp2.getNode())
16340 return SDValue();
16341
16342 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16343 }
16344
16345 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16346
16347 if (ShouldInvert)
16348 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16349
16350 return Cmp;
16351}
16352
16353static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16354 SelectionDAG &DAG) {
16355 SDValue VecOp = ScalarOp.getOperand(0);
16356 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16357 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16358 DAG.getConstant(0, DL, MVT::i64));
16359}
16360
16361static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16362 SDLoc DL, SelectionDAG &DAG) {
16363 unsigned ScalarOpcode;
16364 switch (Opcode) {
16365 case ISD::VECREDUCE_AND:
16366 ScalarOpcode = ISD::AND;
16367 break;
16368 case ISD::VECREDUCE_OR:
16369 ScalarOpcode = ISD::OR;
16370 break;
16371 case ISD::VECREDUCE_XOR:
16372 ScalarOpcode = ISD::XOR;
16373 break;
16374 default:
16375 llvm_unreachable("Expected bitwise vector reduction");
16376 return SDValue();
16377 }
16378
16379 EVT VecVT = Vec.getValueType();
16380 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16381 "Expected power-of-2 length vector");
16382
16383 EVT ElemVT = VecVT.getVectorElementType();
16384
16385 SDValue Result;
16386 unsigned NumElems = VecVT.getVectorNumElements();
16387
16388 // Special case for boolean reductions
16389 if (ElemVT == MVT::i1) {
16390 // Split large vectors into smaller ones
16391 if (NumElems > 16) {
16392 SDValue Lo, Hi;
16393 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16394 EVT HalfVT = Lo.getValueType();
16395 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16396 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16397 }
16398
16399 // Results of setcc operations get widened to 128 bits if their input
16400 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16401 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16402 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16403 // size leads to the best codegen, since e.g. setcc results might need to be
16404 // truncated otherwise.
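 // For example, an AND reduction of <16 x i1> is sign extended to <16 x i8>
 // (all-zeros or all-ones per lane) and computed as a UMINV, whose result is
 // all-ones only if every lane was true.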
16405 unsigned ExtendedWidth = 64;
16406 if (Vec.getOpcode() == ISD::SETCC &&
16407 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16408 ExtendedWidth = 128;
16409 }
16410 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16411
16412 // any_ext doesn't work with umin/umax, so only use it for uadd.
16413 unsigned ExtendOp =
16414 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16415 SDValue Extended = DAG.getNode(
16416 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16417 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16418 // in that case we bitcast the sign extended values from v2i64 to v4i32
16419 // before reduction for optimal code generation.
16420 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16421 NumElems == 2 && ExtendedWidth == 128) {
16422 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16423 ExtendedVT = MVT::i32;
16424 }
16425 switch (ScalarOpcode) {
16426 case ISD::AND:
16427 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16428 break;
16429 case ISD::OR:
16430 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16431 break;
16432 case ISD::XOR:
16433 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16434 break;
16435 default:
16436 llvm_unreachable("Unexpected Opcode");
16437 }
16438
16439 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16440 } else {
16441 // Iteratively split the vector in half and combine using the bitwise
16442 // operation until it fits in a 64 bit register.
16443 while (VecVT.getSizeInBits() > 64) {
16444 SDValue Lo, Hi;
16445 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16446 VecVT = Lo.getValueType();
16447 NumElems = VecVT.getVectorNumElements();
16448 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16449 }
16450
16451 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16452
16453 // Do the remaining work on a scalar since it allows the code generator to
16454 // combine the shift and bitwise operation into one instruction and since
16455 // integer instructions can have higher throughput than vector instructions.
16456 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16457
16458 // Iteratively combine the lower and upper halves of the scalar using the
16459 // bitwise operation, halving the relevant region of the scalar in each
16460 // iteration, until the relevant region is just one element of the original
16461 // vector.
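 // For example, a v8i8 XOR reduction is bitcast to i64 and reduced as
 //   x ^= x >> 32; x ^= x >> 16; x ^= x >> 8;
 // leaving the result in the low 8 bits.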
16462 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16463 SDValue ShiftAmount =
16464 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16465 SDValue Shifted =
16466 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16467 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16468 }
16469
16470 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16471 }
16472
16473 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16474}
16475
16476SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16477 SelectionDAG &DAG) const {
16478 SDValue Src = Op.getOperand(0);
16479 EVT SrcVT = Src.getValueType();
16480
16481 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16482 // widening by inserting zeroes.
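 // I.e. vecreduce_fadd(<2 x half> v) becomes fadd(extract(v, 0), extract(v, 1)),
 // which matches a single FADDP.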
16483 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16484 SrcVT == MVT::v2f16) {
16485 SDLoc DL(Op);
16486 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16487 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16488 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16489 }
16490
16491 // Try to lower fixed length reductions to SVE.
16492 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16493 Op.getOpcode() == ISD::VECREDUCE_AND ||
16494 Op.getOpcode() == ISD::VECREDUCE_OR ||
16495 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16496 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16497 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16498 SrcVT.getVectorElementType() == MVT::i64);
16499 if (SrcVT.isScalableVector() ||
16500 useSVEForFixedLengthVectorVT(
16501 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16502
16503 if (SrcVT.getVectorElementType() == MVT::i1)
16504 return LowerPredReductionToSVE(Op, DAG);
16505
16506 switch (Op.getOpcode()) {
16507 case ISD::VECREDUCE_ADD:
16508 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16509 case ISD::VECREDUCE_AND:
16510 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16511 case ISD::VECREDUCE_OR:
16512 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16513 case ISD::VECREDUCE_SMAX:
16514 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16515 case ISD::VECREDUCE_SMIN:
16516 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16517 case ISD::VECREDUCE_UMAX:
16518 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16519 case ISD::VECREDUCE_UMIN:
16520 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16521 case ISD::VECREDUCE_XOR:
16522 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16523 case ISD::VECREDUCE_FADD:
16524 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16525 case ISD::VECREDUCE_FMAX:
16526 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16527 case ISD::VECREDUCE_FMIN:
16528 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16529 case ISD::VECREDUCE_FMAXIMUM:
16530 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16531 case ISD::VECREDUCE_FMINIMUM:
16532 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16533 default:
16534 llvm_unreachable("Unhandled fixed length reduction");
16535 }
16536 }
16537
16538 // Lower NEON reductions.
16539 SDLoc DL(Op);
16540 switch (Op.getOpcode()) {
16541 case ISD::VECREDUCE_AND:
16542 case ISD::VECREDUCE_OR:
16543 case ISD::VECREDUCE_XOR:
16544 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16545 Op.getValueType(), DL, DAG);
16546 case ISD::VECREDUCE_ADD:
16547 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16548 case ISD::VECREDUCE_SMAX:
16549 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16550 case ISD::VECREDUCE_SMIN:
16551 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16552 case ISD::VECREDUCE_UMAX:
16553 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16554 case ISD::VECREDUCE_UMIN:
16555 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16556 default:
16557 llvm_unreachable("Unhandled reduction");
16558 }
16559}
16560
16561SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16562 SelectionDAG &DAG) const {
16563 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16564 // No point replacing if we don't have the relevant instruction/libcall anyway
16565 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16566 return SDValue();
16567
16568 // LSE has an atomic load-clear instruction, but not a load-and.
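 // I.e. atomic_load_and(ptr, x) is rewritten as atomic_load_clr(ptr, ~x),
 // since a & x == a & ~(~x).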
16569 SDLoc DL(Op);
16570 MVT VT = Op.getSimpleValueType();
16571 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16572 SDValue RHS = Op.getOperand(2);
16573 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16574 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16575 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16576 Op.getOperand(0), Op.getOperand(1), RHS,
16577 AN->getMemOperand());
16578}
16579
16580SDValue
16581AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16582 SelectionDAG &DAG) const {
16583
16584 SDLoc DL(Op);
16585 // Get the inputs.
16586 SDNode *Node = Op.getNode();
16587 SDValue Chain = Op.getOperand(0);
16588 SDValue Size = Op.getOperand(1);
16589 MaybeAlign Align =
16590 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16591 EVT VT = Node->getValueType(0);
16592
16593 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16594 "no-stack-arg-probe")) {
16595 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16596 Chain = SP.getValue(1);
16597 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16598 if (Align)
16599 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16600 DAG.getSignedConstant(-Align->value(), DL, VT));
16601 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16602 SDValue Ops[2] = {SP, Chain};
16603 return DAG.getMergeValues(Ops, DL);
16604 }
16605
16606 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16607
16608 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16609 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16610 PtrVT, 0);
16611
16612 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16613 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16614 if (Subtarget->hasCustomCallingConv())
16615 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16616
16617 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16618 DAG.getConstant(4, DL, MVT::i64));
16619 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16620 Chain =
16621 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16622 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16623 DAG.getRegisterMask(Mask), Chain.getValue(1));
16624 // To match the actual intent better, we should read the output from X15 here
16625 // again (instead of potentially spilling it to the stack), but rereading Size
16626 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16627 // here.
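 // Roughly, the probe protocol is: the requested size is passed to the probe
 // function in X15 in units of 16 bytes (hence the SRL/SHL by 4 around the
 // call), and SP is only adjusted after the probing call returns.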
16628
16629 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16630 DAG.getConstant(4, DL, MVT::i64));
16631
16632 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16633 Chain = SP.getValue(1);
16634 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16635 if (Align)
16636 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16637 DAG.getSignedConstant(-Align->value(), DL, VT));
16638 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16639
16640 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16641
16642 SDValue Ops[2] = {SP, Chain};
16643 return DAG.getMergeValues(Ops, DL);
16644}
16645
16646SDValue
16647AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16648 SelectionDAG &DAG) const {
16649 // Get the inputs.
16650 SDNode *Node = Op.getNode();
16651 SDValue Chain = Op.getOperand(0);
16652 SDValue Size = Op.getOperand(1);
16653
16654 MaybeAlign Align =
16655 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16656 SDLoc DL(Op);
16657 EVT VT = Node->getValueType(0);
16658
16659 // Construct the new SP value in a GPR.
16660 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16661 Chain = SP.getValue(1);
16662 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16663 if (Align)
16664 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16665 DAG.getSignedConstant(-Align->value(), DL, VT));
16666
16667 // Set the real SP to the new value with a probing loop.
16668 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16669 SDValue Ops[2] = {SP, Chain};
16670 return DAG.getMergeValues(Ops, DL);
16671}
16672
16673SDValue
16674AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16675 SelectionDAG &DAG) const {
16676 MachineFunction &MF = DAG.getMachineFunction();
16677
16678 if (Subtarget->isTargetWindows())
16679 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16680 else if (hasInlineStackProbe(MF))
16681 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16682 else
16683 return SDValue();
16684}
16685
16686SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16687 unsigned NewOp) const {
16688 if (Subtarget->hasSVE2())
16689 return LowerToPredicatedOp(Op, DAG, NewOp);
16690
16691 // Default to expand.
16692 return SDValue();
16693}
16694
16695SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16696 SelectionDAG &DAG) const {
16697 EVT VT = Op.getValueType();
16698 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16699
16700 SDLoc DL(Op);
16701 APInt MulImm = Op.getConstantOperandAPInt(0);
16702 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16703 VT);
16704}
16705
16706/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
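/// For example, an st2 of two <vscale x 4 x i32> values yields a memVT of
/// <vscale x 8 x i32> (EC * NumVecs elements).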
16707template <unsigned NumVecs>
16708static bool
16709setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16710 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16711 Info.opc = ISD::INTRINSIC_VOID;
16712 // Retrieve EC from first vector argument.
16713 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16714 ElementCount EC = VT.getVectorElementCount();
16715#ifndef NDEBUG
16716 // Check the assumption that all input vectors are the same type.
16717 for (unsigned I = 0; I < NumVecs; ++I)
16718 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16719 "Invalid type.");
16720#endif
16721 // memVT is `NumVecs * VT`.
16722 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getVectorElementType(),
16723 EC * NumVecs);
16724 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16725 Info.offset = 0;
16726 Info.align.reset();
16727 Info.flags = MachineMemOperand::MOStore;
16728 return true;
16729}
16730
16731/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16732/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16733/// specified in the intrinsic calls.
16734bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16735 const CallInst &I,
16736 MachineFunction &MF,
16737 unsigned Intrinsic) const {
16738 auto &DL = I.getDataLayout();
16739 switch (Intrinsic) {
16740 case Intrinsic::aarch64_sve_st2:
16741 return setInfoSVEStN<2>(*this, DL, Info, I);
16742 case Intrinsic::aarch64_sve_st3:
16743 return setInfoSVEStN<3>(*this, DL, Info, I);
16744 case Intrinsic::aarch64_sve_st4:
16745 return setInfoSVEStN<4>(*this, DL, Info, I);
16746 case Intrinsic::aarch64_neon_ld2:
16747 case Intrinsic::aarch64_neon_ld3:
16748 case Intrinsic::aarch64_neon_ld4:
16749 case Intrinsic::aarch64_neon_ld1x2:
16750 case Intrinsic::aarch64_neon_ld1x3:
16751 case Intrinsic::aarch64_neon_ld1x4: {
16752 Info.opc = ISD::INTRINSIC_W_CHAIN;
16753 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16754 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16755 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16756 Info.offset = 0;
16757 Info.align.reset();
16758 // volatile loads with NEON intrinsics not supported
16759 Info.flags = MachineMemOperand::MOLoad;
16760 return true;
16761 }
16762 case Intrinsic::aarch64_neon_ld2lane:
16763 case Intrinsic::aarch64_neon_ld3lane:
16764 case Intrinsic::aarch64_neon_ld4lane:
16765 case Intrinsic::aarch64_neon_ld2r:
16766 case Intrinsic::aarch64_neon_ld3r:
16767 case Intrinsic::aarch64_neon_ld4r: {
16768 Info.opc = ISD::INTRINSIC_W_CHAIN;
16769 // The ldN intrinsics return a struct whose members all have the same vector type.
16770 Type *RetTy = I.getType();
16771 auto *StructTy = cast<StructType>(RetTy);
16772 unsigned NumElts = StructTy->getNumElements();
16773 Type *VecTy = StructTy->getElementType(0);
16774 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16775 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16776 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16777 Info.offset = 0;
16778 Info.align.reset();
16779 // volatile loads with NEON intrinsics not supported
16780 Info.flags = MachineMemOperand::MOLoad;
16781 return true;
16782 }
16783 case Intrinsic::aarch64_neon_st2:
16784 case Intrinsic::aarch64_neon_st3:
16785 case Intrinsic::aarch64_neon_st4:
16786 case Intrinsic::aarch64_neon_st1x2:
16787 case Intrinsic::aarch64_neon_st1x3:
16788 case Intrinsic::aarch64_neon_st1x4: {
16789 Info.opc = ISD::INTRINSIC_VOID;
16790 unsigned NumElts = 0;
16791 for (const Value *Arg : I.args()) {
16792 Type *ArgTy = Arg->getType();
16793 if (!ArgTy->isVectorTy())
16794 break;
16795 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16796 }
16797 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16798 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16799 Info.offset = 0;
16800 Info.align.reset();
16801 // volatile stores with NEON intrinsics not supported
16802 Info.flags = MachineMemOperand::MOStore;
16803 return true;
16804 }
16805 case Intrinsic::aarch64_neon_st2lane:
16806 case Intrinsic::aarch64_neon_st3lane:
16807 case Intrinsic::aarch64_neon_st4lane: {
16808 Info.opc = ISD::INTRINSIC_VOID;
16809 unsigned NumElts = 0;
16810 // All the vector operands have the same type.
16811 Type *VecTy = I.getArgOperand(0)->getType();
16812 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16813
16814 for (const Value *Arg : I.args()) {
16815 Type *ArgTy = Arg->getType();
16816 if (!ArgTy->isVectorTy())
16817 break;
16818 NumElts += 1;
16819 }
16820
16821 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16822 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16823 Info.offset = 0;
16824 Info.align.reset();
16825 // volatile stores with NEON intrinsics not supported
16826 Info.flags = MachineMemOperand::MOStore;
16827 return true;
16828 }
16829 case Intrinsic::aarch64_ldaxr:
16830 case Intrinsic::aarch64_ldxr: {
16831 Type *ValTy = I.getParamElementType(0);
16832 Info.opc = ISD::INTRINSIC_W_CHAIN;
16833 Info.memVT = MVT::getVT(ValTy);
16834 Info.ptrVal = I.getArgOperand(0);
16835 Info.offset = 0;
16836 Info.align = DL.getABITypeAlign(ValTy);
16837 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16838 return true;
16839 }
16840 case Intrinsic::aarch64_stlxr:
16841 case Intrinsic::aarch64_stxr: {
16842 Type *ValTy = I.getParamElementType(1);
16843 Info.opc = ISD::INTRINSIC_W_CHAIN;
16844 Info.memVT = MVT::getVT(ValTy);
16845 Info.ptrVal = I.getArgOperand(1);
16846 Info.offset = 0;
16847 Info.align = DL.getABITypeAlign(ValTy);
16848 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16849 return true;
16850 }
16851 case Intrinsic::aarch64_ldaxp:
16852 case Intrinsic::aarch64_ldxp:
16853 Info.opc = ISD::INTRINSIC_W_CHAIN;
16854 Info.memVT = MVT::i128;
16855 Info.ptrVal = I.getArgOperand(0);
16856 Info.offset = 0;
16857 Info.align = Align(16);
16858 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16859 return true;
16860 case Intrinsic::aarch64_stlxp:
16861 case Intrinsic::aarch64_stxp:
16862 Info.opc = ISD::INTRINSIC_W_CHAIN;
16863 Info.memVT = MVT::i128;
16864 Info.ptrVal = I.getArgOperand(2);
16865 Info.offset = 0;
16866 Info.align = Align(16);
16867 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16868 return true;
16869 case Intrinsic::aarch64_sve_ldnt1: {
16870 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
16871 Info.opc = ISD::INTRINSIC_W_CHAIN;
16872 Info.memVT = MVT::getVT(I.getType());
16873 Info.ptrVal = I.getArgOperand(1);
16874 Info.offset = 0;
16875 Info.align = DL.getABITypeAlign(ElTy);
16876 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16877 return true;
16878 }
16879 case Intrinsic::aarch64_sve_stnt1: {
16880 Type *ElTy =
16881 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
16882 Info.opc = ISD::INTRINSIC_W_CHAIN;
16883 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
16884 Info.ptrVal = I.getArgOperand(2);
16885 Info.offset = 0;
16886 Info.align = DL.getABITypeAlign(ElTy);
16887 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16888 return true;
16889 }
16890 case Intrinsic::aarch64_mops_memset_tag: {
16891 Value *Dst = I.getArgOperand(0);
16892 Value *Val = I.getArgOperand(1);
16893 Info.opc = ISD::INTRINSIC_W_CHAIN;
16894 Info.memVT = MVT::getVT(Val->getType());
16895 Info.ptrVal = Dst;
16896 Info.offset = 0;
16897 Info.align = I.getParamAlign(0).valueOrOne();
16898 Info.flags = MachineMemOperand::MOStore;
16899 // The size of the memory being operated on is unknown at this point
16900 Info.size = MemoryLocation::UnknownSize;
16901 return true;
16902 }
16903 default:
16904 break;
16905 }
16906
16907 return false;
16908}
16909
16910bool AArch64TargetLowering::shouldReduceLoadWidth(
16911 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
16912 std::optional<unsigned> ByteOffset) const {
16913 // TODO: This may be worth removing. Check regression tests for diffs.
16914 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
16915 ByteOffset))
16916 return false;
16917
16918 // If we're reducing the load width in order to avoid having to use an extra
16919 // instruction to do extension then it's probably a good idea.
16920 if (ExtTy != ISD::NON_EXTLOAD)
16921 return true;
16922 // Don't reduce load width if it would prevent us from combining a shift into
16923 // the offset.
16924 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
16925 assert(Mem);
16926 const SDValue &Base = Mem->getBasePtr();
16927 if (Base.getOpcode() == ISD::ADD &&
16928 Base.getOperand(1).getOpcode() == ISD::SHL &&
16929 Base.getOperand(1).hasOneUse() &&
16930 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
16931 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16932 if (Mem->getMemoryVT().isScalableVector())
16933 return false;
16934 // The shift can be combined if it matches the size of the value being
16935 // loaded (and so reducing the width would make it not match).
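 // For example, an 8-byte load whose address is "base + (index << 3)" keeps
 // the shift foldable into the scaled addressing mode; narrowing the load
 // would change log2(LoadBytes) and force a separate shift.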
16936 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
16937 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16938 if (ShiftAmount == Log2_32(LoadBytes))
16939 return false;
16940 }
16941 // We have no reason to disallow reducing the load width, so allow it.
16942 return true;
16943}
16944
16945// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16946bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16947 EVT VT = Extend.getValueType();
16948 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16949 SDValue Extract = Extend.getOperand(0);
16950 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16951 Extract = Extract.getOperand(0);
16952 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16953 EVT VecVT = Extract.getOperand(0).getValueType();
16954 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16955 return false;
16956 }
16957 }
16958 return true;
16959}
16960
16961// Truncations from 64-bit GPR to 32-bit GPR are free.
16962bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16963 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16964 return false;
16965 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16966 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16967 return NumBits1 > NumBits2;
16968}
16969bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16970 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16971 return false;
16972 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16973 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16974 return NumBits1 > NumBits2;
16975}
16976
16977/// Check if it is profitable to hoist instruction in then/else to if.
16978/// Not profitable if I and its user can form an FMA instruction
16979/// because we prefer FMSUB/FMADD.
16980bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16981 if (I->getOpcode() != Instruction::FMul)
16982 return true;
16983
16984 if (!I->hasOneUse())
16985 return true;
16986
16987 Instruction *User = I->user_back();
16988
16989 if (!(User->getOpcode() == Instruction::FSub ||
16990 User->getOpcode() == Instruction::FAdd))
16991 return true;
16992
16993 const TargetOptions &Options = getTargetMachine().Options;
16994 const Function *F = I->getFunction();
16995 const DataLayout &DL = F->getDataLayout();
16996 Type *Ty = User->getOperand(0)->getType();
16997
16998 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
16999 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
17000 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17001 I->getFastMathFlags().allowContract()));
17002}
17003
17004// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17005// 64-bit GPR.
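// E.g. "add w0, w1, w2" also zeroes bits [63:32] of x0, so a following
// i32 -> i64 zero extend needs no extra instruction.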
17006bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
17007 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17008 return false;
17009 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17010 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17011 return NumBits1 == 32 && NumBits2 == 64;
17012}
17013bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
17014 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17015 return false;
17016 unsigned NumBits1 = VT1.getSizeInBits();
17017 unsigned NumBits2 = VT2.getSizeInBits();
17018 return NumBits1 == 32 && NumBits2 == 64;
17019}
17020
17021bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
17022 EVT VT1 = Val.getValueType();
17023 if (isZExtFree(VT1, VT2)) {
17024 return true;
17025 }
17026
17027 if (Val.getOpcode() != ISD::LOAD)
17028 return false;
17029
17030 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17031 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17032 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17033 VT1.getSizeInBits() <= 32);
17034}
17035
17036bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17037 if (isa<FPExtInst>(Ext))
17038 return false;
17039
17040 // Vector types are not free.
17041 if (Ext->getType()->isVectorTy())
17042 return false;
17043
17044 for (const Use &U : Ext->uses()) {
17045 // The extension is free if we can fold it with a left shift in an
17046 // addressing mode or an arithmetic operation: add, sub, and cmp.
17047
17048 // Is there a shift?
17049 const Instruction *Instr = cast<Instruction>(U.getUser());
17050
17051 // Is this a constant shift?
17052 switch (Instr->getOpcode()) {
17053 case Instruction::Shl:
17054 if (!isa<ConstantInt>(Instr->getOperand(1)))
17055 return false;
17056 break;
17057 case Instruction::GetElementPtr: {
17058 gep_type_iterator GTI = gep_type_begin(Instr);
17059 auto &DL = Ext->getDataLayout();
17060 std::advance(GTI, U.getOperandNo()-1);
17061 Type *IdxTy = GTI.getIndexedType();
17062 // This extension will end up with a shift because of the scaling factor.
17063 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17064 // Get the shift amount based on the scaling factor:
17065 // log2(sizeof(IdxTy)) - log2(8).
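 // For example, an i32 index type has a 32-bit store size, so
 // ShiftAmt = countr_zero(32) - 3 = 2, which is within the 1..4 range
 // accepted below.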
17066 if (IdxTy->isScalableTy())
17067 return false;
17068 uint64_t ShiftAmt =
17069 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17070 3;
17071 // Is the constant foldable in the shift of the addressing mode?
17072 // I.e., shift amount is between 1 and 4 inclusive.
17073 if (ShiftAmt == 0 || ShiftAmt > 4)
17074 return false;
17075 break;
17076 }
17077 case Instruction::Trunc:
17078 // Check if this is a noop.
17079 // trunc(sext ty1 to ty2) to ty1.
17080 if (Instr->getType() == Ext->getOperand(0)->getType())
17081 continue;
17082 [[fallthrough]];
17083 default:
17084 return false;
17085 }
17086
17087 // At this point we can use the bfm family, so this extension is free
17088 // for that use.
17089 }
17090 return true;
17091}
17092
17093static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17094 unsigned NumElts, bool IsLittleEndian,
17095 SmallVectorImpl<int> &Mask) {
17096 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17097 return false;
17098
17099 assert(DstWidth % SrcWidth == 0 &&
17100 "TBL lowering is not supported for a conversion instruction with this "
17101 "source and destination element type.");
17102
17103 unsigned Factor = DstWidth / SrcWidth;
17104 unsigned MaskLen = NumElts * Factor;
17105
17106 Mask.clear();
17107 Mask.resize(MaskLen, NumElts);
17108
17109 unsigned SrcIndex = 0;
17110 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17111 Mask[I] = SrcIndex++;
17112
17113 return true;
17114}
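// For example, for a zext of <4 x i8> to <4 x i32> on little endian the
// factor is 4 and the byte-shuffle mask is
//   <0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4>
// where index 4 (== NumElts) selects the zero element that the callers below
// append as the second shuffle operand.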
17115
17116static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
17117 FixedVectorType *ZExtTy,
17118 FixedVectorType *DstTy,
17119 bool IsLittleEndian) {
17120 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17121 unsigned NumElts = SrcTy->getNumElements();
17122 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17123 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17124
17125 SmallVector<int> Mask;
17126 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17127 return nullptr;
17128
17129 auto *FirstEltZero = Builder.CreateInsertElement(
17130 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17131 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17132 Result = Builder.CreateBitCast(Result, DstTy);
17133 if (DstTy != ZExtTy)
17134 Result = Builder.CreateZExt(Result, ZExtTy);
17135 return Result;
17136}
17137
17138static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
17139 FixedVectorType *DstTy,
17140 bool IsLittleEndian) {
17141 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17142 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17143 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17144
17145 SmallVector<int> Mask;
17146 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17147 !IsLittleEndian, Mask))
17148 return nullptr;
17149
17150 auto *FirstEltZero = Builder.CreateInsertElement(
17151 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17152
17153 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17154}
17155
17156static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17157 IRBuilder<> Builder(TI);
17158 SmallVector<Value *> Parts;
17159 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17160 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17161 auto *DstTy = cast<FixedVectorType>(TI->getType());
17162 assert(SrcTy->getElementType()->isIntegerTy() &&
17163 "Non-integer type source vector element is not supported");
17164 assert(DstTy->getElementType()->isIntegerTy(8) &&
17165 "Unsupported destination vector element type");
17166 unsigned SrcElemTySz =
17167 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17168 unsigned DstElemTySz =
17169 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17170 assert((SrcElemTySz % DstElemTySz == 0) &&
17171 "Cannot lower truncate to tbl instructions for a source element size "
17172 "that is not divisible by the destination element size");
17173 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17174 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17175 "Unsupported source vector element type size");
17176 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17177
17178 // Create a mask to choose every nth byte from the source vector table of
17179 // bytes to create the truncated destination vector, where 'n' is the truncate
17180 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17181 // 0,8,16,..Y*8th bytes for the little-endian format
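 // For example, truncating <8 x i32> to <8 x i8> on little endian uses
 // TruncFactor = 4, so the mask selects bytes 0,4,8,...,28 and pads the
 // remaining lanes with 255 (out of range, so TBL writes zero there).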
17182 SmallVector<Constant *, 16> MaskConst;
17183 for (int Itr = 0; Itr < 16; Itr++) {
17184 if (Itr < NumElements)
17185 MaskConst.push_back(Builder.getInt8(
17186 IsLittleEndian ? Itr * TruncFactor
17187 : Itr * TruncFactor + (TruncFactor - 1)));
17188 else
17189 MaskConst.push_back(Builder.getInt8(255));
17190 }
17191
17192 int MaxTblSz = 128 * 4;
17193 int MaxSrcSz = SrcElemTySz * NumElements;
17194 int ElemsPerTbl =
17195 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17196 assert(ElemsPerTbl <= 16 &&
17197 "Maximum elements selected using TBL instruction cannot exceed 16!");
17198
17199 int ShuffleCount = 128 / SrcElemTySz;
17200 SmallVector<int> ShuffleLanes;
17201 for (int i = 0; i < ShuffleCount; ++i)
17202 ShuffleLanes.push_back(i);
17203
17204 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17205 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17206 // call TBL & save the result in a vector of TBL results for combining later.
17207 SmallVector<Value *> Results;
17208 while (ShuffleLanes.back() < NumElements) {
17209 Parts.push_back(Builder.CreateBitCast(
17210 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17211
17212 if (Parts.size() == 4) {
17213 Parts.push_back(ConstantVector::get(MaskConst));
17214 Results.push_back(
17215 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17216 Parts.clear();
17217 }
17218
17219 for (int i = 0; i < ShuffleCount; ++i)
17220 ShuffleLanes[i] += ShuffleCount;
17221 }
17222
17223 assert((Parts.empty() || Results.empty()) &&
17224 "Lowering trunc for vectors requiring different TBL instructions is "
17225 "not supported!");
17226 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17227 // registers
17228 if (!Parts.empty()) {
17229 Intrinsic::ID TblID;
17230 switch (Parts.size()) {
17231 case 1:
17232 TblID = Intrinsic::aarch64_neon_tbl1;
17233 break;
17234 case 2:
17235 TblID = Intrinsic::aarch64_neon_tbl2;
17236 break;
17237 case 3:
17238 TblID = Intrinsic::aarch64_neon_tbl3;
17239 break;
17240 }
17241
17242 Parts.push_back(ConstantVector::get(MaskConst));
17243 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17244 }
17245
17246 // Extract the destination vector from TBL result(s) after combining them
17247 // where applicable. Currently, at most two TBLs are supported.
17248 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17249 "more than 2 tbl instructions!");
17250 Value *FinalResult = Results[0];
17251 if (Results.size() == 1) {
17252 if (ElemsPerTbl < 16) {
17253 SmallVector<int> FinalMask(ElemsPerTbl);
17254 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17255 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17256 }
17257 } else {
17258 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17259 if (ElemsPerTbl < 16) {
17260 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17261 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17262 } else {
17263 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17264 }
17265 FinalResult =
17266 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17267 }
17268
17269 TI->replaceAllUsesWith(FinalResult);
17270 TI->eraseFromParent();
17271}
17272
17273bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
17274 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17275 // shuffle_vector instructions are serialized when targeting SVE,
17276 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17277 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17278 return false;
17279
17280 // Try to optimize conversions using tbl. This requires materializing constant
17281 // index vectors, which can increase code size and add loads. Skip the
17282 // transform unless the conversion is in a loop block guaranteed to execute
17283 // and we are not optimizing for size.
17284 Function *F = I->getParent()->getParent();
17285 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17286 return false;
17287
17288 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17289 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17290 if (!SrcTy || !DstTy)
17291 return false;
17292
17293 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17294 // lowered to tbl instructions to insert the original i8 elements
17295 // into i8x lanes. This is enabled for cases where it is beneficial.
17296 auto *ZExt = dyn_cast<ZExtInst>(I);
17297 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17298 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17299 if (DstWidth % 8 != 0)
17300 return false;
17301
17302 auto *TruncDstType =
17303 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
17304 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17305 // the remaining ZExt folded into the user, don't use tbl lowering.
17306 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17307 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17310 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17311 return false;
17312
17313 DstTy = TruncDstType;
17314 }
17315
17316 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17317 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17318 // most one extra extend step is needed and using tbl is not profitable.
17319 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17320 // udot instruction.
17321 if (SrcWidth * 4 <= DstWidth) {
17322 if (all_of(I->users(), [&](auto *U) {
17323 auto *SingleUser = cast<Instruction>(&*U);
17324 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17325 return true;
17326 if (match(SingleUser,
17327 m_Intrinsic<
17328 Intrinsic::experimental_vector_partial_reduce_add>(
17329 m_Value(), m_Specific(I))))
17330 return true;
17331 return false;
17332 }))
17333 return false;
17334 }
17335
17336 if (DstTy->getScalarSizeInBits() >= 64)
17337 return false;
17338
17339 IRBuilder<> Builder(ZExt);
17340 Value *Result = createTblShuffleForZExt(
17341 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17342 DstTy, Subtarget->isLittleEndian());
17343 if (!Result)
17344 return false;
17345 ZExt->replaceAllUsesWith(Result);
17346 ZExt->eraseFromParent();
17347 return true;
17348 }
17349
17350 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17351 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17352 DstTy->getElementType()->isFloatTy()) ||
17353 (SrcTy->getElementType()->isIntegerTy(16) &&
17354 DstTy->getElementType()->isDoubleTy()))) {
17355 IRBuilder<> Builder(I);
17356 Value *ZExt = createTblShuffleForZExt(
17357 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17358 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17359 assert(ZExt && "Cannot fail for the i8 to float conversion");
17360 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17361 I->replaceAllUsesWith(UI);
17362 I->eraseFromParent();
17363 return true;
17364 }
17365
17366 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17367 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17368 DstTy->getElementType()->isFloatTy()) {
17369 IRBuilder<> Builder(I);
17370 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17371 FixedVectorType::getInteger(DstTy),
17372 Subtarget->isLittleEndian());
17373 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17374 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17375 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17376 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17377 I->replaceAllUsesWith(SI);
17378 I->eraseFromParent();
17379 return true;
17380 }
17381
17382 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17383 // followed by a truncate lowered to using tbl.4.
17384 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17385 if (FPToUI &&
17386 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17387 SrcTy->getElementType()->isFloatTy() &&
17388 DstTy->getElementType()->isIntegerTy(8)) {
17389 IRBuilder<> Builder(I);
17390 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17391 VectorType::getInteger(SrcTy));
17392 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17393 I->replaceAllUsesWith(TruncI);
17394 I->eraseFromParent();
17395 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17396 return true;
17397 }
17398
17399 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17400 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17401 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17402 // registers
17403 auto *TI = dyn_cast<TruncInst>(I);
17404 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17405 ((SrcTy->getElementType()->isIntegerTy(32) ||
17406 SrcTy->getElementType()->isIntegerTy(64)) &&
17407 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17408 createTblForTrunc(TI, Subtarget->isLittleEndian());
17409 return true;
17410 }
17411
17412 return false;
17413}
17414
17415bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
17416 Align &RequiredAlignment) const {
17417 if (!LoadedType.isSimple() ||
17418 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17419 return false;
17420 // Cyclone supports unaligned accesses.
17421 RequiredAlignment = Align(1);
17422 unsigned NumBits = LoadedType.getSizeInBits();
17423 return NumBits == 32 || NumBits == 64;
17424}
17425
17426/// A helper function for determining the number of interleaved accesses we
17427/// will generate when lowering accesses of the given type.
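/// For example, a <16 x i32> access lowered with 128-bit NEON registers
/// needs (16 * 32 + 127) / 128 = 4 interleaved accesses.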
17428unsigned AArch64TargetLowering::getNumInterleavedAccesses(
17429 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17430 unsigned VecSize = 128;
17431 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17432 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17433 if (UseScalable && isa<FixedVectorType>(VecTy))
17434 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17435 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17436}
17437
17438MachineMemOperand::Flags
17439AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17440 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17441 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17442 return MOStridedAccess;
17443 return MachineMemOperand::MONone;
17444}
17445
17446bool AArch64TargetLowering::isLegalInterleavedAccessType(
17447 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17448 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17449 auto EC = VecTy->getElementCount();
17450 unsigned MinElts = EC.getKnownMinValue();
17451
17452 UseScalable = false;
17453
17454 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17455 (!Subtarget->useSVEForFixedLengthVectors() ||
17457 return false;
17458
17459 if (isa<ScalableVectorType>(VecTy) &&
17460 !Subtarget->isSVEorStreamingSVEAvailable())
17461 return false;
17462
17463 // Ensure the number of vector elements is greater than 1.
17464 if (MinElts < 2)
17465 return false;
17466
17467 // Ensure the element type is legal.
17468 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17469 return false;
17470
17471 if (EC.isScalable()) {
17472 UseScalable = true;
17473 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17474 }
17475
17476 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17477 if (Subtarget->useSVEForFixedLengthVectors()) {
17478 unsigned MinSVEVectorSize =
17479 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17480 if (VecSize % MinSVEVectorSize == 0 ||
17481 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17482 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17483 UseScalable = true;
17484 return true;
17485 }
17486 }
17487
17488 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17489 // 128 will be split into multiple interleaved accesses.
17490 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17491}
17492
17493static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17494 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17495 return ScalableVectorType::get(VTy->getElementType(), 2);
17496
17497 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17498 return ScalableVectorType::get(VTy->getElementType(), 4);
17499
17500 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17501 return ScalableVectorType::get(VTy->getElementType(), 8);
17502
17503 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17504 return ScalableVectorType::get(VTy->getElementType(), 8);
17505
17506 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17507 return ScalableVectorType::get(VTy->getElementType(), 2);
17508
17509 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17510 return ScalableVectorType::get(VTy->getElementType(), 4);
17511
17512 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17513 return ScalableVectorType::get(VTy->getElementType(), 8);
17514
17515 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17516 return ScalableVectorType::get(VTy->getElementType(), 16);
17517
17518 llvm_unreachable("Cannot handle input vector type");
17519}
17520
17521static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17522 bool Scalable, Type *LDVTy,
17523 Type *PtrTy) {
17524 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17525 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17526 Intrinsic::aarch64_sve_ld3_sret,
17527 Intrinsic::aarch64_sve_ld4_sret};
17528 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17529 Intrinsic::aarch64_neon_ld3,
17530 Intrinsic::aarch64_neon_ld4};
17531 if (Scalable)
17532 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17533
17534 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17535 {LDVTy, PtrTy});
17536}
17537
17538static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17539 bool Scalable, Type *STVTy,
17540 Type *PtrTy) {
17541 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17542 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17543 Intrinsic::aarch64_sve_st3,
17544 Intrinsic::aarch64_sve_st4};
17545 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17546 Intrinsic::aarch64_neon_st3,
17547 Intrinsic::aarch64_neon_st4};
17548 if (Scalable)
17549 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17550
17551 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17552 {STVTy, PtrTy});
17553}
17554
17555/// Lower an interleaved load into a ldN intrinsic.
17556///
17557/// E.g. Lower an interleaved load (Factor = 2):
17558/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17559/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17560/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17561///
17562/// Into:
17563/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17564/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17565/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17566bool AArch64TargetLowering::lowerInterleavedLoad(
17567 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17568 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17569 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17570 "Invalid interleave factor");
17571 assert(!Shuffles.empty() && "Empty shufflevector input");
17572 assert(Shuffles.size() == Indices.size() &&
17573 "Unmatched number of shufflevectors and indices");
17574
17575 auto *LI = dyn_cast<LoadInst>(Load);
17576 if (!LI)
17577 return false;
17578 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17579
17580 const DataLayout &DL = LI->getDataLayout();
17581
17582 VectorType *VTy = Shuffles[0]->getType();
17583
17584 // Skip if we do not have NEON and skip illegal vector types. We can
17585 // "legalize" wide vector types into multiple interleaved accesses as long as
17586 // the vector types are divisible by 128.
17587 bool UseScalable;
17588 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17589 return false;
17590
17591 // Check if the interleave is a zext(shuffle), that can be better optimized
17592 // into shift / and masks. For the moment we do this just for uitofp (not
17593 // zext) to avoid issues with widening instructions.
17594 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17595 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17596 SI->getType()->getScalarSizeInBits() * 4 ==
17597 SI->user_back()->getType()->getScalarSizeInBits();
17598 }))
17599 return false;
17600
17601 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17602
17603 auto *FVTy = cast<FixedVectorType>(VTy);
17604
17605 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17606 // load integer vectors first and then convert to pointer vectors.
17607 Type *EltTy = FVTy->getElementType();
17608 if (EltTy->isPointerTy())
17609 FVTy =
17610 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17611
17612 // If we're going to generate more than one load, reset the sub-vector type
17613 // to something legal.
17614 FVTy = FixedVectorType::get(FVTy->getElementType(),
17615 FVTy->getNumElements() / NumLoads);
17616
17617 auto *LDVTy =
17618 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17619
17620 IRBuilder<> Builder(LI);
17621
17622 // The base address of the load.
17623 Value *BaseAddr = LI->getPointerOperand();
17624
17625 Type *PtrTy = LI->getPointerOperandType();
17626 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17627 LDVTy->getElementCount());
17628
17629 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17630 UseScalable, LDVTy, PtrTy);
17631
17632 // Holds sub-vectors extracted from the load intrinsic return values. The
17633 // sub-vectors are associated with the shufflevector instructions they will
17634 // replace.
17635 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17636
17637 Value *PTrue = nullptr;
17638 if (UseScalable) {
17639 std::optional<unsigned> PgPattern =
17640 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17641 if (Subtarget->getMinSVEVectorSizeInBits() ==
17642 Subtarget->getMaxSVEVectorSizeInBits() &&
17643 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17644 PgPattern = AArch64SVEPredPattern::all;
17645
17646 auto *PTruePat =
17647 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17648 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17649 {PTruePat});
17650 }
17651
17652 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17653
17654 // If we're generating more than one load, compute the base address of
17655 // subsequent loads as an offset from the previous.
17656 if (LoadCount > 0)
17657 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17658 FVTy->getNumElements() * Factor);
17659
17660 CallInst *LdN;
17661 if (UseScalable)
17662 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17663 else
17664 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17665
17666 // Extract and store the sub-vectors returned by the load intrinsic.
17667 for (unsigned i = 0; i < Shuffles.size(); i++) {
17668 ShuffleVectorInst *SVI = Shuffles[i];
17669 unsigned Index = Indices[i];
17670
17671 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17672
17673 if (UseScalable)
17674 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17675
17676 // Convert the integer vector to pointer vector if the element is pointer.
17677 if (EltTy->isPointerTy())
17678 SubVec = Builder.CreateIntToPtr(
17679 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17680 FVTy->getNumElements()));
17681
17682 SubVecs[SVI].push_back(SubVec);
17683 }
17684 }
17685
17686 // Replace uses of the shufflevector instructions with the sub-vectors
17687 // returned by the load intrinsic. If a shufflevector instruction is
17688 // associated with more than one sub-vector, those sub-vectors will be
17689 // concatenated into a single wide vector.
17690 for (ShuffleVectorInst *SVI : Shuffles) {
17691 auto &SubVec = SubVecs[SVI];
17692 auto *WideVec =
17693 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17694 SVI->replaceAllUsesWith(WideVec);
17695 }
17696
17697 return true;
17698}
17699
17700template <typename Iter>
17701bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17702 int MaxLookupDist = 20;
17703 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17704 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17705 const Value *PtrA1 =
17706 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17707
17708 while (++It != End) {
17709 if (It->isDebugOrPseudoInst())
17710 continue;
17711 if (MaxLookupDist-- == 0)
17712 break;
17713 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17714 const Value *PtrB1 =
17715 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17716 DL, OffsetB);
17717 if (PtrA1 == PtrB1 &&
17718 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17719 .abs() == 16)
17720 return true;
17721 }
17722 }
17723
17724 return false;
17725}
17726
17727/// Lower an interleaved store into a stN intrinsic.
17728///
17729/// E.g. Lower an interleaved store (Factor = 3):
17730/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17731/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17732/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17733///
17734/// Into:
17735/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17736/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17737/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17738/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17739///
17740/// Note that the new shufflevectors will be removed and we'll only generate one
17741/// st3 instruction in CodeGen.
17742///
17743/// Example for a more general valid mask (Factor 3). Lower:
17744/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17745/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17746/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17747///
17748/// Into:
17749/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17750/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17751/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17752/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17753bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
17754 Value *LaneMask,
17755 ShuffleVectorInst *SVI,
17756 unsigned Factor,
17757 const APInt &GapMask) const {
17758
17759 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17760 "Invalid interleave factor");
17761 auto *SI = dyn_cast<StoreInst>(Store);
17762 if (!SI)
17763 return false;
17764 assert(!LaneMask && GapMask.popcount() == Factor &&
17765 "Unexpected mask on store");
17766
17767 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17768 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17769
17770 unsigned LaneLen = VecTy->getNumElements() / Factor;
17771 Type *EltTy = VecTy->getElementType();
17772 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17773
17774 const DataLayout &DL = SI->getDataLayout();
17775 bool UseScalable;
17776
17777 // Skip if we do not have NEON and skip illegal vector types. We can
17778 // "legalize" wide vector types into multiple interleaved accesses as long as
17779 // the vector types are divisible by 128.
17780 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17781 return false;
17782
17783 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17784
17785 Value *Op0 = SVI->getOperand(0);
17786 Value *Op1 = SVI->getOperand(1);
17787 IRBuilder<> Builder(SI);
17788
17789 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17790 // vectors to integer vectors.
17791 if (EltTy->isPointerTy()) {
17792 Type *IntTy = DL.getIntPtrType(EltTy);
17793 unsigned NumOpElts =
17794 cast<FixedVectorType>(Op0->getType())->getNumElements();
17795
17796 // Convert to the corresponding integer vector.
17797 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17798 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17799 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17800
17801 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17802 }
17803
17804 // If we're going to generate more than one store, reset the lane length
17805 // and sub-vector type to something legal.
17806 LaneLen /= NumStores;
17807 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17808
17809 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17810 : SubVecTy;
17811
17812 // The base address of the store.
17813 Value *BaseAddr = SI->getPointerOperand();
17814
17815 auto Mask = SVI->getShuffleMask();
17816
17817 // Sanity check: bail out if all of the indices are out of range.
17818 // If the mask is `poison`, `Mask` may be a vector of -1s.
17819 // If all of them are `poison`, an out-of-bounds read would happen later.
17820 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17821 return false;
17822 }
17823 // A 64-bit st2 which does not start at element 0 will involve adding extra
17824 // ext elements, making the st2 unprofitable. If there is also a nearby store
17825 // that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
17826 // zip;ldp pair, which has higher throughput.
17827 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17828 (Mask[0] != 0 ||
17829 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17830 DL) ||
17831 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17832 BaseAddr, DL)))
17833 return false;
17834
17835 Type *PtrTy = SI->getPointerOperandType();
17836 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17837 STVTy->getElementCount());
17838
17839 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17840 UseScalable, STVTy, PtrTy);
17841
17842 Value *PTrue = nullptr;
17843 if (UseScalable) {
17844 std::optional<unsigned> PgPattern =
17845 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17846 if (Subtarget->getMinSVEVectorSizeInBits() ==
17847 Subtarget->getMaxSVEVectorSizeInBits() &&
17848 Subtarget->getMinSVEVectorSizeInBits() ==
17849 DL.getTypeSizeInBits(SubVecTy))
17850 PgPattern = AArch64SVEPredPattern::all;
17851
17852 auto *PTruePat =
17853 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
17854 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17855 {PTruePat});
17856 }
17857
17858 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17859
17861
17862 // Split the shufflevector operands into sub vectors for the new stN call.
17863 for (unsigned i = 0; i < Factor; i++) {
17864 Value *Shuffle;
17865 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17866 if (Mask[IdxI] >= 0) {
17867 Shuffle = Builder.CreateShuffleVector(
17868 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17869 } else {
17870 unsigned StartMask = 0;
17871 for (unsigned j = 1; j < LaneLen; j++) {
17872 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17873 if (Mask[IdxJ] >= 0) {
17874 StartMask = Mask[IdxJ] - j;
17875 break;
17876 }
17877 }
17878 // Note: Filling undef gaps with random elements is ok, since
17879 // those elements were being written anyway (with undefs).
17880 // In the case of all undefs we default to using elements from 0.
17881 // Note: StartMask cannot be negative; it's checked in
17882 // isReInterleaveMask.
17883 Shuffle = Builder.CreateShuffleVector(
17884 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17885 }
17886
17887 if (UseScalable)
17888 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
17889 Shuffle, uint64_t(0));
17890
17891 Ops.push_back(Shuffle);
17892 }
17893
17894 if (UseScalable)
17895 Ops.push_back(PTrue);
17896
17897 // If we are generating more than one store, we compute the base address of
17898 // subsequent stores as an offset from the previous one.
17899 if (StoreCount > 0)
17900 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17901 BaseAddr, LaneLen * Factor);
17902
17903 Ops.push_back(BaseAddr);
17904 Builder.CreateCall(StNFunc, Ops);
17905 }
17906 return true;
17907}
17908
17909bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17910 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
17911 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
17912 if (Factor != 2 && Factor != 4) {
17913 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17914 return false;
17915 }
17916 auto *LI = dyn_cast<LoadInst>(Load);
17917 if (!LI)
17918 return false;
17919 assert(!Mask && "Unexpected mask on a load\n");
17920
17921 VectorType *VTy = getDeinterleavedVectorType(DI);
17922
17923 const DataLayout &DL = LI->getModule()->getDataLayout();
17924 bool UseScalable;
17925 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17926 return false;
17927
17928 // TODO: Add support for using SVE instructions with fixed types later, using
17929 // the code from lowerInterleavedLoad to obtain the correct container type.
17930 if (UseScalable && !VTy->isScalableTy())
17931 return false;
17932
17933 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17934 VectorType *LdTy =
17935 VectorType::get(VTy->getElementType(),
17936 VTy->getElementCount().divideCoefficientBy(NumLoads));
17937
17938 Type *PtrTy = LI->getPointerOperandType();
17939 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17940 UseScalable, LdTy, PtrTy);
17941
17942 IRBuilder<> Builder(LI);
17943 Value *Pred = nullptr;
17944 if (UseScalable)
17945 Pred =
17946 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17947
17948 Value *BaseAddr = LI->getPointerOperand();
17949 Value *Result = nullptr;
17950 if (NumLoads > 1) {
17951 // Create multiple legal small ldN.
17952 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
17953 for (unsigned I = 0; I < NumLoads; ++I) {
17954 Value *Offset = Builder.getInt64(I * Factor);
17955
17956 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17957 Value *LdN = nullptr;
17958 if (UseScalable)
17959 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17960 else
17961 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17962 Value *Idx =
17963 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17964 for (unsigned J = 0; J < Factor; ++J) {
17965 ExtractedLdValues[J] = Builder.CreateInsertVector(
17966 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
17967 }
17968 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17969 }
17970
17971 // Merge the values from different factors.
17972 Result = PoisonValue::get(DI->getType());
17973 for (unsigned J = 0; J < Factor; ++J)
17974 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
17975 } else {
17976 if (UseScalable)
17977 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17978 else
17979 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17980 }
17981
17982 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17983 DI->replaceAllUsesWith(Result);
17984 return true;
17985}
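// Example of the multi-load path above (sizes are illustrative assumptions):
// for a factor-2 deinterleave producing <vscale x 32 x i8> values with a
// 128-bit SVE container, NumLoads is 2 and LdTy is <vscale x 16 x i8>, so two
// ld2 calls are emitted at BaseAddr and BaseAddr + 2 * sizeof(LdTy); their
// results are inserted into the low and high halves of each deinterleaved
// value before being merged into the struct the intrinsic returns.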
17986
17987bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17988 Instruction *Store, Value *Mask,
17989 ArrayRef<Value *> InterleavedValues) const {
17990 unsigned Factor = InterleavedValues.size();
17991 if (Factor != 2 && Factor != 4) {
17992 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17993 return false;
17994 }
17995 StoreInst *SI = dyn_cast<StoreInst>(Store);
17996 if (!SI)
17997 return false;
17998 assert(!Mask && "Unexpected mask on plain store");
17999
18000 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18001 const DataLayout &DL = SI->getModule()->getDataLayout();
18002
18003 bool UseScalable;
18004 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18005 return false;
18006
18007 // TODO: Add support for using SVE instructions with fixed types later, using
18008 // the code from lowerInterleavedStore to obtain the correct container type.
18009 if (UseScalable && !VTy->isScalableTy())
18010 return false;
18011
18012 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18013
18014 VectorType *StTy =
18015 VectorType::get(VTy->getElementType(),
18016 VTy->getElementCount().divideCoefficientBy(NumStores));
18017
18018 Type *PtrTy = SI->getPointerOperandType();
18019 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18020 UseScalable, StTy, PtrTy);
18021
18022 IRBuilder<> Builder(SI);
18023
18024 Value *BaseAddr = SI->getPointerOperand();
18025 Value *Pred = nullptr;
18026
18027 if (UseScalable)
18028 Pred =
18029 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18030
18031 auto ExtractedValues = InterleavedValues;
18032 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18033 if (UseScalable)
18034 StoreOperands.push_back(Pred);
18035 StoreOperands.push_back(BaseAddr);
18036 for (unsigned I = 0; I < NumStores; ++I) {
18037 Value *Address = BaseAddr;
18038 if (NumStores > 1) {
18039 Value *Offset = Builder.getInt64(I * Factor);
18040 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18041 Value *Idx =
18042 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18043 for (unsigned J = 0; J < Factor; J++) {
18044 StoreOperands[J] =
18045 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18046 }
18047 // update the address
18048 StoreOperands[StoreOperands.size() - 1] = Address;
18049 }
18050 Builder.CreateCall(StNFunc, StoreOperands);
18051 }
18052 return true;
18053}
18054
18055EVT AArch64TargetLowering::getOptimalMemOpType(
18056 LLVMContext &Context, const MemOp &Op,
18057 const AttributeList &FuncAttributes) const {
18058 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18059 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18060 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18061 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
18062 // would have taken one instruction to materialize the v2i64 zero and one store
18063 // (with a restrictive addressing mode), so just do i64 stores.
18064 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18065 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18066 if (Op.isAligned(AlignCheck))
18067 return true;
18068 unsigned Fast;
18069 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18070 MachineMemOperand::MONone, &Fast) &&
18071 Fast;
18072 };
18073
18074 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18075 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18076 return MVT::v16i8;
18077 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18078 return MVT::f128;
18079 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18080 return MVT::i64;
18081 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18082 return MVT::i32;
18083 return MVT::Other;
18084}
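// Worked examples of the selection above (illustrative, assuming NEON/FP are
// available and NoImplicitFloat is absent):
//   memset of 64 bytes, 16-byte aligned   -> MVT::v16i8 (SIMD stores)
//   memcpy of 64 bytes, 16-byte aligned   -> MVT::f128  (q-register copies)
//   memset of 16 bytes, 8-byte aligned    -> MVT::i64   (small memset, GPR stores)
//   4-byte copy, 4-byte aligned, strict alignment -> MVT::i32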
18085
18086LLT AArch64TargetLowering::getOptimalMemOpLLT(
18087 const MemOp &Op, const AttributeList &FuncAttributes) const {
18088 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18089 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18090 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18091 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
18092 // would have taken one instruction to materialize the v2i64 zero and one store
18093 // (with a restrictive addressing mode), so just do i64 stores.
18094 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18095 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18096 if (Op.isAligned(AlignCheck))
18097 return true;
18098 unsigned Fast;
18099 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18100 MachineMemOperand::MONone, &Fast) &&
18101 Fast;
18102 };
18103
18104 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18105 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18106 return LLT::fixed_vector(2, 64);
18107 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18108 return LLT::scalar(128);
18109 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18110 return LLT::scalar(64);
18111 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18112 return LLT::scalar(32);
18113 return LLT();
18114}
18115
18116// 12-bit optionally shifted immediates are legal for adds.
18117bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
18118 if (Immed == std::numeric_limits<int64_t>::min()) {
18119 return false;
18120 }
18121 // Same encoding for add/sub, just flip the sign.
18122 return isLegalArithImmed((uint64_t)std::abs(Immed));
18123}
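// Worked examples (standard A64 add/sub immediate encodings):
//   isLegalAddImmediate(4095)     -> true   (uimm12: add x0, x1, #4095)
//   isLegalAddImmediate(0x123000) -> true   (shifted uimm12: add x0, x1, #0x123, lsl #12)
//   isLegalAddImmediate(-4095)    -> true   (same encoding, flipped to a sub)
//   isLegalAddImmediate(0x1001)   -> false  (bits in both halves need two instructions)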
18124
18125bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18126 // We will only emit addvl/inc* instructions for SVE2
18127 if (!Subtarget->hasSVE2())
18128 return false;
18129
18130 // addvl's immediate is in terms of the number of bytes in a register.
18131 // Since the minimum supported register size is 128 bits (16 bytes), we
18132 // divide the immediate by 16 to get a useful multiple of vscale; the
18133 // division must leave no remainder.
18134 if (Imm % 16 == 0)
18135 return isInt<6>(Imm / 16);
18136
18137 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18138 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18139 // of addvl as a result, so only take h|w|d into account.
18140 // Dec[h|w|d] will cover subtractions.
18141 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18142 // FIXME: Can we make use of other patterns to cover other immediates?
18143
18144 // inch|dech
18145 if (Imm % 8 == 0)
18146 return std::abs(Imm / 8) <= 16;
18147 // incw|decw
18148 if (Imm % 4 == 0)
18149 return std::abs(Imm / 4) <= 16;
18150 // incd|decd
18151 if (Imm % 2 == 0)
18152 return std::abs(Imm / 2) <= 16;
18153
18154 return false;
18155}
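// Worked examples for the checks above (vscale-relative byte offsets):
//   Imm = 48   -> 48 % 16 == 0 and 48 / 16 = 3 fits isInt<6>   -> true  (addvl #3)
//   Imm = -24  -> not a multiple of 16, but -24 / 8 = -3        -> true  (dech-style)
//   Imm = 2000 -> 2000 % 16 == 0 but 2000 / 16 = 125 > 63       -> false
//   Imm = 7    -> odd, no matching inc/dec multiple             -> false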
18156
18157// Return false to prevent folding
18158// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18159// if the folding leads to worse code.
18160bool AArch64TargetLowering::isMulAddWithConstProfitable(
18161 SDValue AddNode, SDValue ConstNode) const {
18162 // Let the DAGCombiner decide for vector types and large types.
18163 const EVT VT = AddNode.getValueType();
18164 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18165 return true;
18166
18167 // It is worse if c1 is legal add immediate, while c1*c2 is not
18168 // and has to be composed by at least two instructions.
18169 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18170 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18171 const int64_t C1 = C1Node->getSExtValue();
18172 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18173 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
18174 return true;
18175 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18176 // Adapt to the width of a register.
18177 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18178 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18179 if (Insn.size() > 1)
18180 return false;
18181
18182 // Default to true and let the DAGCombiner decide.
18183 return true;
18184}
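// Illustrative case (constants chosen here for exposition): for
// (mul (add x, 1), 0x111100) the add immediate 1 is legal, but 1 * 0x111100 is
// not an add immediate and expandMOVImm needs a movz+movk pair to build it, so
// this returns false and the DAG keeps the cheaper mul-of-add form.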
18185
18186// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18187// immediates is the same as for an add or a sub.
18188bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18189 return isLegalAddImmediate(Immed);
18190}
18191
18192/// isLegalAddressingMode - Return true if the addressing mode represented
18193/// by AM is legal for this target, for a load/store of the specified type.
18194bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18195 const AddrMode &AMode, Type *Ty,
18196 unsigned AS, Instruction *I) const {
18197 // AArch64 has five basic addressing modes:
18198 // reg
18199 // reg + 9-bit signed offset
18200 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18201 // reg1 + reg2
18202 // reg + SIZE_IN_BYTES * reg
18203
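// Illustrative encodings for the modes above (example registers):
//   ldr x0, [x1]              reg
//   ldur x0, [x1, #-8]        reg + 9-bit signed offset
//   ldr x0, [x1, #8016]       reg + 8 * uimm12 for an i64 (8016 = 8 * 1002)
//   ldr x0, [x1, x2]          reg1 + reg2
//   ldr x0, [x1, x2, lsl #3]  reg + 8 * reg for an i64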
18204 // No global is ever allowed as a base.
18205 if (AMode.BaseGV)
18206 return false;
18207
18208 // No reg+reg+imm addressing.
18209 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18210 return false;
18211
18212 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18213 // `2*ScaledReg` into `BaseReg + ScaledReg`
18214 AddrMode AM = AMode;
18215 if (AM.Scale && !AM.HasBaseReg) {
18216 if (AM.Scale == 1) {
18217 AM.HasBaseReg = true;
18218 AM.Scale = 0;
18219 } else if (AM.Scale == 2) {
18220 AM.HasBaseReg = true;
18221 AM.Scale = 1;
18222 } else {
18223 return false;
18224 }
18225 }
18226
18227 // A base register is required in all addressing modes.
18228 if (!AM.HasBaseReg)
18229 return false;
18230
18231 if (Ty->isScalableTy()) {
18232 if (isa<ScalableVectorType>(Ty)) {
18233 // See if we have a foldable vscale-based offset, for vector types which
18234 // are either legal or smaller than the minimum; more work will be
18235 // required if we need to consider addressing for types which need
18236 // legalization by splitting.
18237 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18238 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18239 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18240 isPowerOf2_64(VecNumBytes))
18241 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18242
18243 uint64_t VecElemNumBytes =
18244 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18245 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18246 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18247 }
18248
18249 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18250 }
18251
18252 // No scalable offsets allowed for non-scalable types.
18253 if (AM.ScalableOffset)
18254 return false;
18255
18256 // check reg + imm case:
18257 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18258 uint64_t NumBytes = 0;
18259 if (Ty->isSized()) {
18260 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18261 NumBytes = NumBits / 8;
18262 if (!isPowerOf2_64(NumBits))
18263 NumBytes = 0;
18264 }
18265
18266 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18267 AM.Scale);
18268}
18269
18270// Check whether the two offsets belong to the same imm24 range and share the
18271// same high 12 bits; if so, the high part can be folded into the offset of an add.
18272int64_t
18273AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18274 int64_t MaxOffset) const {
18275 int64_t HighPart = MinOffset & ~0xfffULL;
18276 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18277 // Rebase the value to an integer multiple of imm12.
18278 return HighPart;
18279 }
18280
18281 return 0;
18282}
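// Worked example (illustrative offsets): MinOffset = 0x1234 and
// MaxOffset = 0x1ff8 share the high part 0x1000 (both >> 12 == 1), and 0x1000
// is a legal shifted add immediate, so 0x1000 is returned and later uses only
// need the low 12-bit remainders.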
18283
18284bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18285 // Consider splitting large offset of struct or array.
18286 return true;
18287}
18288
18289bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18290 const MachineFunction &MF, EVT VT) const {
18291 EVT ScalarVT = VT.getScalarType();
18292
18293 if (!ScalarVT.isSimple())
18294 return false;
18295
18296 switch (ScalarVT.getSimpleVT().SimpleTy) {
18297 case MVT::f16:
18298 return Subtarget->hasFullFP16();
18299 case MVT::f32:
18300 case MVT::f64:
18301 return true;
18302 case MVT::bf16:
18303 return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
18305 default:
18306 break;
18307 }
18308
18309 return false;
18310}
18311
18312bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18313 Type *Ty) const {
18314 switch (Ty->getScalarType()->getTypeID()) {
18315 case Type::FloatTyID:
18316 case Type::DoubleTyID:
18317 return true;
18318 default:
18319 return false;
18320 }
18321}
18322
18323bool AArch64TargetLowering::generateFMAsInMachineCombiner(
18324 EVT VT, CodeGenOptLevel OptLevel) const {
18325 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18327}
18328
18329const MCPhysReg *
18330AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
18331 // LR is a callee-save register, but we must treat it as clobbered by any call
18332 // site. Hence we include LR in the scratch registers, which are in turn added
18333 // as implicit-defs for stackmaps and patchpoints.
18334 static const MCPhysReg ScratchRegs[] = {
18335 AArch64::X16, AArch64::X17, AArch64::LR, 0
18336 };
18337 return ScratchRegs;
18338}
18339
18340ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
18341 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18342 return RCRegs;
18343}
18344
18345bool
18346AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18347 CombineLevel Level) const {
18348 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18349 N->getOpcode() == ISD::SRL) &&
18350 "Expected shift op");
18351
18352 SDValue ShiftLHS = N->getOperand(0);
18353 EVT VT = N->getValueType(0);
18354
18355 if (!ShiftLHS->hasOneUse())
18356 return false;
18357
18358 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18359 !ShiftLHS.getOperand(0)->hasOneUse())
18360 return false;
18361
18362 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18363 // combine it with shift 'N' to let it be lowered to UBFX except:
18364 // ((x >> C) & mask) << C.
18365 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18366 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18367 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18368 if (isMask_64(TruncMask)) {
18369 SDValue AndLHS = ShiftLHS.getOperand(0);
18370 if (AndLHS.getOpcode() == ISD::SRL) {
18371 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18372 if (N->getOpcode() == ISD::SHL)
18373 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18374 return SRLC->getZExtValue() == SHLC->getZExtValue();
18375 return false;
18376 }
18377 }
18378 }
18379 }
18380 return true;
18381}
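// Example of the UBFX carve-out above (illustrative constants): for
// N = (shl (and (srl x, 3), 0xff), 2) this returns false so the srl+and is
// kept and can become ubfx, while (shl (and (srl x, 3), 0xff), 3) returns
// true because the equal shift amounts cancel after commuting.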
18382
18383bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
18384 const SDNode *N) const {
18385 assert(N->getOpcode() == ISD::XOR &&
18386 (N->getOperand(0).getOpcode() == ISD::SHL ||
18387 N->getOperand(0).getOpcode() == ISD::SRL) &&
18388 "Expected XOR(SHIFT) pattern");
18389
18390 // Only commute if the entire NOT mask is a hidden shifted mask.
18391 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18392 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18393 if (XorC && ShiftC) {
18394 unsigned MaskIdx, MaskLen;
18395 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18396 unsigned ShiftAmt = ShiftC->getZExtValue();
18397 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18398 if (N->getOperand(0).getOpcode() == ISD::SHL)
18399 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18400 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18401 }
18402 }
18403
18404 return false;
18405}
18406
18407bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
18408 const SDNode *N, CombineLevel Level) const {
18409 assert(((N->getOpcode() == ISD::SHL &&
18410 N->getOperand(0).getOpcode() == ISD::SRL) ||
18411 (N->getOpcode() == ISD::SRL &&
18412 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18413 "Expected shift-shift mask");
18414 // Don't allow multiuse shift folding with the same shift amount.
18415 if (!N->getOperand(0)->hasOneUse())
18416 return false;
18417
18418 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18419 EVT VT = N->getValueType(0);
18420 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18421 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18422 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18423 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18424 }
18425
18426 // We do not need to fold when this shift is used in the specific load case:
18427 // (ldr x, (add x, (shl (srl x, c1) 2)))
18428 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18429 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18430 unsigned ShlAmt = C2->getZExtValue();
18431 if (auto ShouldADD = *N->user_begin();
18432 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18433 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18434 EVT MemVT = Load->getMemoryVT();
18435
18436 if (Load->getValueType(0).isScalableVector())
18437 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18438
18439 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18440 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18441 }
18442 }
18443 }
18444 }
18445
18446 return true;
18447}
18448
18449bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18450 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18451 SDValue Y) const {
18452 return VT.isScalableVector() && isTypeLegal(VT) &&
18453 SelectOpcode == ISD::VSELECT;
18454}
18455
18456bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18457 Type *Ty) const {
18458 assert(Ty->isIntegerTy());
18459
18460 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18461 if (BitSize == 0)
18462 return false;
18463
18464 int64_t Val = Imm.getSExtValue();
18465 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18466 return true;
18467
18468 if (Val < 0)
18469 Val = ~Val;
18470 if (BitSize == 32)
18471 Val &= (1LL << 32) - 1;
18472
18473 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18474 // MOVZ is free so return true for one or fewer MOVK.
18475 return Shift < 3;
18476}
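// Worked examples for the MOVZ/MOVK instruction-count heuristic above:
//   Imm = 0xabcd1234 (i64)        -> Shift = 31 / 16 = 1 -> true  (movz + 1 movk)
//   Imm = 0x00ff00ff00ff00ff      -> logical immediate    -> true  (single orr/mov)
//   Imm = 0x1234567890abcdef      -> Shift = 60 / 16 = 3 -> false (movz + 3 movk)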
18477
18478bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18479 unsigned Index) const {
18480 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18481 return false;
18482
18483 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18484}
18485
18486/// Turn vector tests of the signbit in the form of:
18487/// xor (sra X, elt_size(X)-1), -1
18488/// into:
18489/// cmge X, X, #0
18490static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18491 const AArch64Subtarget *Subtarget) {
18492 EVT VT = N->getValueType(0);
18493 if (!Subtarget->hasNEON() || !VT.isVector())
18494 return SDValue();
18495
18496 // There must be a shift right algebraic before the xor, and the xor must be a
18497 // 'not' operation.
18498 SDValue Shift = N->getOperand(0);
18499 SDValue Ones = N->getOperand(1);
18500 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18501 !ISD::isConstantSplatVectorAllOnes(Ones.getNode()))
18502 return SDValue();
18503
18504 // The shift should be smearing the sign bit across each vector element.
18505 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18506 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18507 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18508 return SDValue();
18509
18510 SDLoc DL(N);
18511 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18512 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18513}
18514
18515// Given a vecreduce_add node, detect the below pattern and convert it to the
18516// node sequence with UABDL, [S|U]ABD and UADDLP.
18517//
18518// i32 vecreduce_add(
18519// v16i32 abs(
18520// v16i32 sub(
18521// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18522//
18523// or
18524//
18525// i32 vecreduce_add(
18526// v16i32 zext(
18527// v16i16 abs(
18528// v16i16 sub(
18529// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18530//
18531// =================>
18532// i32 vecreduce_add(
18533// v4i32 UADDLP(
18534// v8i16 add(
18535// v8i16 zext(
18536// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18537// v8i16 zext(
18538// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18539static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18540 SelectionDAG &DAG) {
18541 // Assumed i32 vecreduce_add
18542 if (N->getValueType(0) != MVT::i32)
18543 return SDValue();
18544
18545 SDValue VecReduceOp0 = N->getOperand(0);
18546 bool SawTrailingZext = false;
18547 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18548 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18549 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18550 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18551 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18552 SawTrailingZext = true;
18553 VecReduceOp0 = VecReduceOp0.getOperand(0);
18554 }
18555
18556 // The ABS input is v16i16 if we looked through a trailing zext, else v16i32.
18557 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18558 // Assumed v16i16 or v16i32 abs input
18559 unsigned Opcode = VecReduceOp0.getOpcode();
18560 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18561 return SDValue();
18562
18563 SDValue ABS = VecReduceOp0;
18564 // Assumed v16i16 or v16i32 sub
18565 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18566 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18567 return SDValue();
18568
18569 SDValue SUB = ABS->getOperand(0);
18570 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18571 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18572 // Assumed v16i16 or v16i32 type
18573 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18574 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18575 return SDValue();
18576
18577 // Assumed zext or sext
18578 bool IsZExt = false;
18579 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18580 IsZExt = true;
18581 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18582 IsZExt = false;
18583 } else
18584 return SDValue();
18585
18586 SDValue EXT0 = SUB->getOperand(0);
18587 SDValue EXT1 = SUB->getOperand(1);
18588 // Assumed zext's operand has v16i8 type
18589 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18590 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18591 return SDValue();
18592
18593 // Pattern is detected. Let's convert it to sequence of nodes.
18594 SDLoc DL(N);
18595
18596 // First, create the node pattern of UABD/SABD.
18597 SDValue UABDHigh8Op0 =
18598 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18599 DAG.getConstant(8, DL, MVT::i64));
18600 SDValue UABDHigh8Op1 =
18601 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18602 DAG.getConstant(8, DL, MVT::i64));
18603 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18604 UABDHigh8Op0, UABDHigh8Op1);
18605 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18606
18607 // Second, create the node pattern of UABAL.
18608 SDValue UABDLo8Op0 =
18609 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18610 DAG.getConstant(0, DL, MVT::i64));
18611 SDValue UABDLo8Op1 =
18612 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18613 DAG.getConstant(0, DL, MVT::i64));
18614 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18615 UABDLo8Op0, UABDLo8Op1);
18616 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18617 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18618
18619 // Third, create the node of UADDLP.
18620 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18621
18622 // Fourth, create the node of VECREDUCE_ADD.
18623 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18624}
18625
18626static SDValue
18627performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18628 const AArch64Subtarget *ST) {
18629 if (DCI.isBeforeLegalize())
18630 return SDValue();
18631
18632 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18633 /*IsEqual=*/false))
18634 return While;
18635
18636 if (!N->getValueType(0).isScalableVector() ||
18637 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18638 return SDValue();
18639
18640 unsigned NumUses = N->use_size();
18641 auto MaskEC = N->getValueType(0).getVectorElementCount();
18642 if (!MaskEC.isKnownMultipleOf(NumUses))
18643 return SDValue();
18644
18645 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumUses);
18646 if (ExtMinEC.getKnownMinValue() < 2)
18647 return SDValue();
18648
18649 SmallVector<SDNode *> Extracts(NumUses, nullptr);
18650 for (SDNode *Use : N->users()) {
18651 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18652 return SDValue();
18653
18654 // Ensure the extract type is correct (e.g. if NumUses is 4 and
18655 // the mask return type is nxv8i1, each extract should be nxv2i1).
18656 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18657 return SDValue();
18658
18659 // There should be exactly one extract for each part of the mask.
18660 unsigned Offset = Use->getConstantOperandVal(1);
18661 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18662 if (Extracts[Part] != nullptr)
18663 return SDValue();
18664
18665 Extracts[Part] = Use;
18666 }
18667
18668 SelectionDAG &DAG = DCI.DAG;
18669 SDLoc DL(N);
18670 SDValue ID =
18671 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18672
18673 SDValue Idx = N->getOperand(0);
18674 SDValue TC = N->getOperand(1);
18675 EVT OpVT = Idx.getValueType();
18676 if (OpVT != MVT::i64) {
18677 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18678 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18679 }
18680
18681 // Create the whilelo_x2 intrinsics from each pair of extracts
18682 EVT ExtVT = Extracts[0]->getValueType(0);
18683 auto R =
18684 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18685 DCI.CombineTo(Extracts[0], R.getValue(0));
18686 DCI.CombineTo(Extracts[1], R.getValue(1));
18687
18688 if (NumUses == 2)
18689 return SDValue(N, 0);
18690
18691 auto Elts = DAG.getElementCount(DL, OpVT, ExtVT.getVectorElementCount() * 2);
18692 for (unsigned I = 2; I < NumUses; I += 2) {
18693 // After the first whilelo_x2, we need to increment the starting value.
18694 Idx = DAG.getNode(ISD::UADDSAT, DL, OpVT, Idx, Elts);
18695 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18696 DCI.CombineTo(Extracts[I], R.getValue(0));
18697 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18698 }
18699
18700 return SDValue(N, 0);
18701}
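// Illustrative shape for the combine above (types are an assumed example, on a
// subtarget with SVE2p1): a get_active_lane_mask producing nxv4i1 whose only
// users are two EXTRACT_SUBVECTORs of nxv2i1 at indices 0 and 2 is rewritten
// to one aarch64_sve_whilelo_x2 call returning { nxv2i1, nxv2i1 }, and each
// extract is replaced by the corresponding result value.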
18702
18703// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and a vecreduce:
18704//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18705//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18706// If we have vectors larger than v16i8 we extract v16i8 vectors,
18707// follow the same steps above to get DOT instructions, concatenate them
18708// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
18709static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18710 const AArch64Subtarget *ST) {
18711 if (!ST->isNeonAvailable())
18712 return SDValue();
18713
18714 if (!ST->hasDotProd())
18715 return performVecReduceAddCombineWithUADDLP(N, DAG);
18716
18717 SDValue Op0 = N->getOperand(0);
18718 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18719 Op0.getValueType().getVectorElementType() != MVT::i32)
18720 return SDValue();
18721
18722 unsigned ExtOpcode = Op0.getOpcode();
18723 SDValue A = Op0;
18724 SDValue B;
18725 unsigned DotOpcode;
18726 if (ExtOpcode == ISD::MUL) {
18727 A = Op0.getOperand(0);
18728 B = Op0.getOperand(1);
18729 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18730 return SDValue();
18731 auto OpCodeA = A.getOpcode();
18732 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18733 return SDValue();
18734
18735 auto OpCodeB = B.getOpcode();
18736 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18737 return SDValue();
18738
18739 if (OpCodeA == OpCodeB) {
18740 DotOpcode =
18741 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18742 } else {
18743 // Check USDOT support.
18744 if (!ST->hasMatMulInt8())
18745 return SDValue();
18746 DotOpcode = AArch64ISD::USDOT;
18747 if (OpCodeA == ISD::SIGN_EXTEND)
18748 std::swap(A, B);
18749 }
18750 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18751 DotOpcode = AArch64ISD::UDOT;
18752 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18753 DotOpcode = AArch64ISD::SDOT;
18754 } else {
18755 return SDValue();
18756 }
18757
18758 EVT Op0VT = A.getOperand(0).getValueType();
18759 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18760 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18761 if (!IsValidElementCount || !IsValidSize)
18762 return SDValue();
18763
18764 SDLoc DL(Op0);
18765 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18766 // the extend B.
18767 if (!B)
18768 B = DAG.getConstant(1, DL, Op0VT);
18769 else
18770 B = B.getOperand(0);
18771
18772 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18773 unsigned NumOfVecReduce;
18774 EVT TargetType;
18775 if (IsMultipleOf16) {
18776 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18777 TargetType = MVT::v4i32;
18778 } else {
18779 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18780 TargetType = MVT::v2i32;
18781 }
18782 // Handle the case where we need to generate only one Dot operation.
18783 if (NumOfVecReduce == 1) {
18784 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18785 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18786 A.getOperand(0), B);
18787 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18788 }
18789 // Generate Dot instructions that are multiple of 16.
18790 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18791 SmallVector<SDValue, 4> SDotVec16;
18792 unsigned I = 0;
18793 for (; I < VecReduce16Num; I += 1) {
18794 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18795 SDValue Op0 =
18796 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18797 DAG.getConstant(I * 16, DL, MVT::i64));
18798 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18799 DAG.getConstant(I * 16, DL, MVT::i64));
18800 SDValue Dot =
18801 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18802 SDotVec16.push_back(Dot);
18803 }
18804 // Concatenate dot operations.
18805 EVT SDot16EVT =
18806 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18807 SDValue ConcatSDot16 =
18808 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18809 SDValue VecReduceAdd16 =
18810 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18811 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18812 if (VecReduce8Num == 0)
18813 return VecReduceAdd16;
18814
18815 // Generate the remainder Dot operation that is multiple of 8.
18816 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18817 SDValue Vec8Op0 =
18818 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18819 DAG.getConstant(I * 16, DL, MVT::i64));
18820 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18821 DAG.getConstant(I * 16, DL, MVT::i64));
18822 SDValue Dot =
18823 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18824 SDValue VecReduceAdd8 =
18825 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18826 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18827 VecReduceAdd8);
18828}
18829
18830// Given an (integer) vecreduce, we know the order of the inputs does not
18831// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18832// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18833// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18834static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18835 auto DetectAddExtract = [&](SDValue A) {
18836 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18837 // UADDLP(x) if found.
18838 assert(A.getOpcode() == ISD::ADD);
18839 EVT VT = A.getValueType();
18840 SDValue Op0 = A.getOperand(0);
18841 SDValue Op1 = A.getOperand(1);
18842 if (Op0.getOpcode() != Op1.getOpcode() ||
18843 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18844 Op0.getOpcode() != ISD::SIGN_EXTEND))
18845 return SDValue();
18846 SDValue Ext0 = Op0.getOperand(0);
18847 SDValue Ext1 = Op1.getOperand(0);
18848 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18849 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18850 Ext0.getOperand(0) != Ext1.getOperand(0))
18851 return SDValue();
18852 // Check that the source type is twice the width of the add type, and that
18853 // the extracts are from the upper/lower halves of the same source.
18854 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
18855 VT.getVectorNumElements() * 2)
18856 return SDValue();
18857 if ((Ext0.getConstantOperandVal(1) != 0 ||
18858 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
18859 (Ext1.getConstantOperandVal(1) != 0 ||
18860 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
18861 return SDValue();
18862 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18863 : AArch64ISD::SADDLP;
18864 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
18865 };
18866
18867 if (SDValue R = DetectAddExtract(A))
18868 return R;
18869
18870 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
18871 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
18872 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18873 A.getOperand(1));
18874 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
18875 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
18876 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18877 A.getOperand(0));
18878 return SDValue();
18879}
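// Concrete instance of the pattern above (example types):
//   UADDV(add(zext(v8i8 extract_subvector(x:v16i8, 0)) to v8i16,
//             zext(v8i8 extract_subvector(x:v16i8, 8)) to v8i16))
// becomes UADDV(UADDLP(x):v8i16), removing both extends.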
18880
18881// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18882// UADDLV(concat), where the concat represents the 64-bit zext sources.
18883static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18884 // Look for add(zext(64-bit source), zext(64-bit source)), returning
18885 // UADDLV(concat(zext, zext)) if found.
18886 assert(A.getOpcode() == ISD::ADD);
18887 EVT VT = A.getValueType();
18888 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18889 return SDValue();
18890 SDValue Op0 = A.getOperand(0);
18891 SDValue Op1 = A.getOperand(1);
18892 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18893 return SDValue();
18894 SDValue Ext0 = Op0.getOperand(0);
18895 SDValue Ext1 = Op1.getOperand(0);
18896 EVT ExtVT0 = Ext0.getValueType();
18897 EVT ExtVT1 = Ext1.getValueType();
18898 // Check zext VTs are the same and 64-bit length.
18899 if (ExtVT0 != ExtVT1 ||
18900 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18901 return SDValue();
18902 // Get VT for concat of zext sources.
18903 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
18904 SDValue Concat =
18905 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
18906
18907 switch (VT.getSimpleVT().SimpleTy) {
18908 case MVT::v2i64:
18909 case MVT::v4i32:
18910 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
18911 case MVT::v8i16: {
18912 SDValue Uaddlv =
18913 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
18914 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
18915 }
18916 default:
18917 llvm_unreachable("Unhandled vector type");
18918 }
18919}
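// Concrete instance of the transform above (example types):
// UADDV(add(zext v8i8 %a to v8i16, zext v8i8 %b to v8i16)) is replaced by
// NVCAST(UADDLV(concat_vectors(%a, %b):v16i8):v4i32):v8i16, since UADDLV
// already sums every lane of the concatenated sources.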
18920
18921static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18922 SDValue A = N->getOperand(0);
18923 if (A.getOpcode() == ISD::ADD) {
18924 if (SDValue R = performUADDVAddCombine(A, DAG))
18925 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
18926 else if (SDValue R = performUADDVZextCombine(A, DAG))
18927 return R;
18928 }
18929 return SDValue();
18930}
18931
18932static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18933 TargetLowering::DAGCombinerInfo &DCI,
18934 const AArch64Subtarget *Subtarget) {
18935 if (DCI.isBeforeLegalizeOps())
18936 return SDValue();
18937
18938 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18939}
18940
18941SDValue
18942AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18943 SelectionDAG &DAG,
18944 SmallVectorImpl<SDNode *> &Created) const {
18945 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18946 if (isIntDivCheap(N->getValueType(0), Attr))
18947 return SDValue(N, 0); // Lower SDIV as SDIV
18948
18949 EVT VT = N->getValueType(0);
18950
18951 // If SVE is available, we can generate
18952 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
18953 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
18954 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
18955 return SDValue(N, 0);
18956
18957 // fold (sdiv X, pow2)
18958 if ((VT != MVT::i32 && VT != MVT::i64) ||
18959 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18960 return SDValue();
18961
18962 // If the divisor is 2 or -2, the default expansion is better. It will add
18963 // (N->getOperand(0) >> (BitWidth - 1)) to it before shifting right.
18964 if (Divisor == 2 ||
18965 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18966 return SDValue();
18967
18968 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18969}
18970
18971SDValue
18972AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18973 SelectionDAG &DAG,
18974 SmallVectorImpl<SDNode *> &Created) const {
18975 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18976 if (isIntDivCheap(N->getValueType(0), Attr))
18977 return SDValue(N, 0); // Lower SREM as SREM
18978
18979 EVT VT = N->getValueType(0);
18980
18981 // For scalable and fixed types, mark them as cheap so we can handle it much
18982 // later. This allows us to handle larger than legal types.
18983 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18984 return SDValue(N, 0);
18985
18986 // fold (srem X, pow2)
18987 if ((VT != MVT::i32 && VT != MVT::i64) ||
18988 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18989 return SDValue();
18990
18991 unsigned Lg2 = Divisor.countr_zero();
18992 if (Lg2 == 0)
18993 return SDValue();
18994
18995 SDLoc DL(N);
18996 SDValue N0 = N->getOperand(0);
18997 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
18998 SDValue Zero = DAG.getConstant(0, DL, VT);
18999 SDValue CCVal, CSNeg;
19000 if (Lg2 == 1) {
19001 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19002 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19003 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19004
19005 Created.push_back(Cmp.getNode());
19006 Created.push_back(And.getNode());
19007 } else {
19008 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19009 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19010
19011 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19012 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19013 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19014 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19015 Negs.getValue(1));
19016
19017 Created.push_back(Negs.getNode());
19018 Created.push_back(AndPos.getNode());
19019 Created.push_back(AndNeg.getNode());
19020 }
19021
19022 return CSNeg;
19023}
19024
19025static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19026 switch(getIntrinsicID(S.getNode())) {
19027 default:
19028 break;
19029 case Intrinsic::aarch64_sve_cntb:
19030 return 8;
19031 case Intrinsic::aarch64_sve_cnth:
19032 return 16;
19033 case Intrinsic::aarch64_sve_cntw:
19034 return 32;
19035 case Intrinsic::aarch64_sve_cntd:
19036 return 64;
19037 }
19038 return {};
19039}
19040
19041/// Calculates what the pre-extend type is, based on the extension
19042/// operation node provided by \p Extend.
19043///
19044/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19045/// pre-extend type is pulled directly from the operand, while other extend
19046/// operations need a bit more inspection to get this information.
19047///
19048/// \param Extend The SDNode from the DAG that represents the extend operation
19049///
19050/// \returns The type representing the \p Extend source type, or \p MVT::Other
19051/// if no valid type can be determined
19052static EVT calculatePreExtendType(SDValue Extend) {
19053 switch (Extend.getOpcode()) {
19054 case ISD::SIGN_EXTEND:
19055 case ISD::ZERO_EXTEND:
19056 case ISD::ANY_EXTEND:
19057 return Extend.getOperand(0).getValueType();
19058 case ISD::AssertSext:
19059 case ISD::AssertZext:
19060 case ISD::SIGN_EXTEND_INREG: {
19061 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19062 if (!TypeNode)
19063 return MVT::Other;
19064 return TypeNode->getVT();
19065 }
19066 case ISD::AND: {
19067 ConstantSDNode *Constant =
19068 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
19069 if (!Constant)
19070 return MVT::Other;
19071
19072 uint32_t Mask = Constant->getZExtValue();
19073
19074 if (Mask == UCHAR_MAX)
19075 return MVT::i8;
19076 else if (Mask == USHRT_MAX)
19077 return MVT::i16;
19078 else if (Mask == UINT_MAX)
19079 return MVT::i32;
19080
19081 return MVT::Other;
19082 }
19083 default:
19084 return MVT::Other;
19085 }
19086}
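// Examples of the mapping above (illustrative nodes):
//   zext i8 %x to i32             -> returns i8
//   AssertZext(%x, ValueType:i16) -> returns i16
//   and %x, 255                   -> returns i8
//   and %x, 4094                  -> returns MVT::Other (mask is not a full byte/half/word)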
19087
19088/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19089/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19090/// SExt/ZExt rather than the scalar SExt/ZExt
19091static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
19092 EVT VT = BV.getValueType();
19093 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19094 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
19095 return SDValue();
19096
19097 // Use the first item in the buildvector/shuffle to get the size of the
19098 // extend, and make sure it looks valid.
19099 SDValue Extend = BV->getOperand(0);
19100 unsigned ExtendOpcode = Extend.getOpcode();
19101 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19102 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19103 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19104 ExtendOpcode == ISD::AssertSext;
19105 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19106 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19107 return SDValue();
19108 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19109 // ensure calculatePreExtendType will work without issue.
19110 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19111 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19112 return SDValue();
19113
19114 // Restrict valid pre-extend data type
19115 EVT PreExtendType = calculatePreExtendType(Extend);
19116 if (PreExtendType == MVT::Other ||
19117 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19118 return SDValue();
19119
19120 // Make sure all other operands are equally extended.
19121 bool SeenZExtOrSExt = !IsAnyExt;
19122 for (SDValue Op : drop_begin(BV->ops())) {
19123 if (Op.isUndef())
19124 continue;
19125
19126 if (calculatePreExtendType(Op) != PreExtendType)
19127 return SDValue();
19128
19129 unsigned Opc = Op.getOpcode();
19130 if (Opc == ISD::ANY_EXTEND)
19131 continue;
19132
19133 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19134 Opc == ISD::AssertSext;
19135
19136 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19137 return SDValue();
19138
19139 IsSExt = OpcIsSExt;
19140 SeenZExtOrSExt = true;
19141 }
19142
19143 SDValue NBV;
19144 SDLoc DL(BV);
19145 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19146 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19147 EVT PreExtendLegalType =
19148 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19150 for (SDValue Op : BV->ops())
19151 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19152 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19153 PreExtendLegalType));
19154 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19155 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19156 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19157 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19158 BV.getOperand(1).isUndef()
19159 ? DAG.getUNDEF(PreExtendVT)
19160 : BV.getOperand(1).getOperand(0),
19161 cast<ShuffleVectorSDNode>(BV)->getMask());
19162 }
19163 unsigned ExtOpc = !SeenZExtOrSExt
19164 ? ISD::ANY_EXTEND
19165 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19166 return DAG.getNode(ExtOpc, DL, VT, NBV);
19167}
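// Concrete instance of the combine above (example values):
//   v4i32 build_vector(zext i16 %a to i32, zext i16 %b to i32,
//                      zext i16 %c to i32, zext i16 %d to i32)
// becomes zext(v4i16 build_vector(%a, %b, %c, %d)) to v4i32, so later
// multiplies can use the vector-widening forms instead of scalar extends.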
19168
19169/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19170/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19171static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
19172 // If the value type isn't a vector, none of the operands are going to be dups
19173 EVT VT = Mul->getValueType(0);
19174 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19175 return SDValue();
19176
19177 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19178 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19179
19180 // Neither operands have been changed, don't make any further changes
19181 if (!Op0 && !Op1)
19182 return SDValue();
19183
19184 SDLoc DL(Mul);
19185 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19186 Op1 ? Op1 : Mul->getOperand(1));
19187}
19188
19189// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19190// Same for other types with equivalent constants.
19191static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
19192 EVT VT = N->getValueType(0);
19193 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19194 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19195 return SDValue();
19196 if (N->getOperand(0).getOpcode() != ISD::AND ||
19197 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19198 return SDValue();
19199
19200 SDValue And = N->getOperand(0);
19201 SDValue Srl = And.getOperand(0);
19202
19203 APInt V1, V2, V3;
19204 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19205 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19206 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
19207 return SDValue();
19208
19209 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19210 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19211 V3 != (HalfSize - 1))
19212 return SDValue();
19213
19214 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19215 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19216 VT.getVectorElementCount() * 2);
19217
19218 SDLoc DL(N);
19219 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19220 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19221 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19222 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19223}
19224
19225// Transform vector add(zext i8 to i32, zext i8 to i32)
19226// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19227// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19228// extends.
19229static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
19230 EVT VT = N->getValueType(0);
19231 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19232 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19233 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19234 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19235 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19236 N->getOperand(0).getOperand(0).getValueType() !=
19237 N->getOperand(1).getOperand(0).getValueType())
19238 return SDValue();
19239
19240 if (N->getOpcode() == ISD::MUL &&
19241 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19242 return SDValue();
19243
19244 SDValue N0 = N->getOperand(0).getOperand(0);
19245 SDValue N1 = N->getOperand(1).getOperand(0);
19246 EVT InVT = N0.getValueType();
19247
19248 EVT S1 = InVT.getScalarType();
19249 EVT S2 = VT.getScalarType();
19250 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19251 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19252 SDLoc DL(N);
19253 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19256 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19257 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19258 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19259 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19260 : (unsigned)ISD::SIGN_EXTEND,
19261 DL, VT, NewOp);
19262 }
19263 return SDValue();
19264}
19265
19266static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
19267 TargetLowering::DAGCombinerInfo &DCI,
19268 const AArch64Subtarget *Subtarget) {
19269
19270 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19271 return Ext;
19272 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
19273 return Ext;
19274 if (SDValue Ext = performVectorExtCombine(N, DAG))
19275 return Ext;
19276
19277 if (DCI.isBeforeLegalizeOps())
19278 return SDValue();
19279
19280 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19281 // and in MachineCombiner pass, add+mul will be combined into madd.
19282 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19283 SDLoc DL(N);
19284 EVT VT = N->getValueType(0);
19285 SDValue N0 = N->getOperand(0);
19286 SDValue N1 = N->getOperand(1);
19287 SDValue MulOper;
19288 unsigned AddSubOpc;
19289
19290 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19291 AddSubOpc = V->getOpcode();
19292 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19293 SDValue Opnd = V->getOperand(1);
19294 MulOper = V->getOperand(0);
19295 if (AddSubOpc == ISD::SUB)
19296 std::swap(Opnd, MulOper);
19297 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19298 return C->isOne();
19299 }
19300 return false;
19301 };
19302
19303 if (IsAddSubWith1(N0)) {
19304 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19305 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19306 }
19307
19308 if (IsAddSubWith1(N1)) {
19309 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19310 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19311 }
19312
19313 // The below optimizations require a constant RHS.
19314 if (!isa<ConstantSDNode>(N1))
19315 return SDValue();
19316
19317 ConstantSDNode *C = cast<ConstantSDNode>(N1);
19318 const APInt &ConstValue = C->getAPIntValue();
19319
19320 // Allow the scaling to be folded into the `cnt` instruction by preventing
19321 // the scaling from being obscured here. This makes it easier to pattern match.
19322 if (IsSVECntIntrinsic(N0) ||
19323 (N0->getOpcode() == ISD::TRUNCATE &&
19324 (IsSVECntIntrinsic(N0->getOperand(0)))))
19325 if (ConstValue.sge(1) && ConstValue.sle(16))
19326 return SDValue();
19327
19328 // Multiplication of a power of two plus/minus one can be done more
19329 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19330 // future CPUs have a cheaper MADD instruction, this may need to be
19331 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19332 // 64-bit is 5 cycles, so this is always a win.
19333 // More aggressively, some multiplications N0 * C can be lowered to
19334 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19335 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19336 // TODO: lower more cases.
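// Worked examples of the decompositions mentioned above (illustrative):
//   x * 9  = x + (x << 3)                          single shifted add
//   x * 6  = (x + (x << 1)) << 1                   shift+add+shift (6 = 3 * 2)
//   x * 45 = y + (y << 3), where y = x + (x << 2)  (45 = (1+4) * (1+8))
//   x * 7  = (x << 3) - x                          shift then sub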
19337
19338 // TrailingZeroes is used to test if the mul can be lowered to
19339 // shift+add+shift.
19340 unsigned TrailingZeroes = ConstValue.countr_zero();
19341 if (TrailingZeroes) {
19342 // Conservatively do not lower to shift+add+shift if the mul might be
19343 // folded into smul or umul.
19344 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19345 isZeroExtended(N0, DAG)))
19346 return SDValue();
19347 // Conservatively do not lower to shift+add+shift if the mul might be
19348 // folded into madd or msub.
19349 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19350 N->user_begin()->getOpcode() == ISD::SUB))
19351 return SDValue();
19352 }
19353 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19354 // and shift+add+shift.
19355 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19356 unsigned ShiftAmt;
19357
19358 auto Shl = [&](SDValue N0, unsigned N1) {
19359 if (!N0.getNode())
19360 return SDValue();
19361 // If shift causes overflow, ignore this combine.
19362 if (N1 >= N0.getValueSizeInBits())
19363 return SDValue();
19364 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19365 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19366 };
19367 auto Add = [&](SDValue N0, SDValue N1) {
19368 if (!N0.getNode() || !N1.getNode())
19369 return SDValue();
19370 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19371 };
19372 auto Sub = [&](SDValue N0, SDValue N1) {
19373 if (!N0.getNode() || !N1.getNode())
19374 return SDValue();
19375 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19376 };
19377 auto Negate = [&](SDValue N) {
19378 if (!N0.getNode())
19379 return SDValue();
19380 SDValue Zero = DAG.getConstant(0, DL, VT);
19381 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19382 };
19383
19384 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), e.g.:
19385 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
19386 // the (2^N - 1) can't be executed via a single instruction.
19387 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19388 unsigned BitWidth = C.getBitWidth();
19389 for (unsigned i = 1; i < BitWidth / 2; i++) {
19390 APInt Rem;
19391 APInt X(BitWidth, (1 << i) + 1);
19392 APInt::sdivrem(C, X, N, Rem);
19393 APInt NVMinus1 = N - 1;
19394 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19395 M = X;
19396 return true;
19397 }
19398 }
19399 return false;
19400 };
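// Worked example (illustrative): C = 45 yields M = 5 and N = 9, i.e.
// 45 = (1 + 2^2) * (1 + 2^3), which the code below emits as
//   MVal = (x << 2) + x        ; x * 5
//   Res  = (MVal << 3) + MVal  ; x * 45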
19401
19402 // Can the const C be decomposed into (2^M + 1) * 2^N + 1, e.g.:
19403 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
19404 // the (2^N - 1) can't be executed via a single instruction.
19405 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19406 APInt CVMinus1 = C - 1;
19407 if (CVMinus1.isNegative())
19408 return false;
19409 unsigned TrailingZeroes = CVMinus1.countr_zero();
19410 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19411 if (SCVMinus1.isPowerOf2()) {
19412 unsigned BitWidth = SCVMinus1.getBitWidth();
19413 M = APInt(BitWidth, SCVMinus1.logBase2());
19414 N = APInt(BitWidth, TrailingZeroes);
19415 return true;
19416 }
19417 return false;
19418 };
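// Worked example (illustrative): C = 11 yields M = 2 and N = 1, i.e.
// 11 = (2^2 + 1) * 2^1 + 1, which the code below emits as
//   MVal = (x << 2) + x     ; x * 5
//   Res  = (MVal << 1) + x  ; x * 11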
19419
19420 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19421 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19422 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19423 APInt CVMinus1 = C - 1;
19424 if (CVMinus1.isNegative())
19425 return false;
19426 unsigned TrailingZeroes = CVMinus1.countr_zero();
19427 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19428 if (CVPlus1.isPowerOf2()) {
19429 unsigned BitWidth = CVPlus1.getBitWidth();
19430 M = APInt(BitWidth, CVPlus1.logBase2());
19431 N = APInt(BitWidth, TrailingZeroes);
19432 return true;
19433 }
19434 return false;
19435 };
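// Worked example (illustrative): C = 29 yields M = 3 and N = 2, i.e.
// 29 = 1 - (1 - 2^3) * 2^2, which the code below emits as
//   MVal = x - (x << 3)     ; x * -7
//   Res  = x - (MVal << 2)  ; x * 29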
19436
19437 if (ConstValue.isNonNegative()) {
19438 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19439 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19440 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19441 // (mul x, (2^M + 1) * (2^N + 1))
19442 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19443 // (mul x, (2^M + 1) * 2^N + 1)
19444 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
19445 // (mul x, 1 - (1 - 2^M) * 2^N)
19446 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
19447 APInt SCVMinus1 = ShiftedConstValue - 1;
19448 APInt SCVPlus1 = ShiftedConstValue + 1;
19449 APInt CVPlus1 = ConstValue + 1;
19450 APInt CVM, CVN;
19451 if (SCVMinus1.isPowerOf2()) {
19452 ShiftAmt = SCVMinus1.logBase2();
19453 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19454 } else if (CVPlus1.isPowerOf2()) {
19455 ShiftAmt = CVPlus1.logBase2();
19456 return Sub(Shl(N0, ShiftAmt), N0);
19457 } else if (SCVPlus1.isPowerOf2()) {
19458 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19459 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19460 }
19461 if (Subtarget->hasALULSLFast() &&
19462 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19463 APInt CVMMinus1 = CVM - 1;
19464 APInt CVNMinus1 = CVN - 1;
19465 unsigned ShiftM1 = CVMMinus1.logBase2();
19466 unsigned ShiftN1 = CVNMinus1.logBase2();
19467 // ALULSLFast implies that shifts of <= 4 places are fast
19468 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19469 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19470 return Add(Shl(MVal, ShiftN1), MVal);
19471 }
19472 }
19473 if (Subtarget->hasALULSLFast() &&
19474 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19475 unsigned ShiftM = CVM.getZExtValue();
19476 unsigned ShiftN = CVN.getZExtValue();
19477 // ALULSLFast implies that shifts of <= 4 places are fast
19478 if (ShiftM <= 4 && ShiftN <= 4) {
19479 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19480 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19481 }
19482 }
19483
19484 if (Subtarget->hasALULSLFast() &&
19485 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19486 unsigned ShiftM = CVM.getZExtValue();
19487 unsigned ShiftN = CVN.getZExtValue();
19488 // ALULSLFast implies that shifts of <= 4 places are fast
19489 if (ShiftM <= 4 && ShiftN <= 4) {
19490 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19491 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19492 }
19493 }
19494 } else {
19495 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19496 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19497 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
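// Worked example (illustrative): C = -7 matches the first pattern and is
// lowered as x - (x << 3), while C = -12 = -(2^2 - 1) * 2^2 matches the third
// pattern and is lowered as (x << 2) - (x << 4).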
19498 APInt SCVPlus1 = -ShiftedConstValue + 1;
19499 APInt CVNegPlus1 = -ConstValue + 1;
19500 APInt CVNegMinus1 = -ConstValue - 1;
19501 if (CVNegPlus1.isPowerOf2()) {
19502 ShiftAmt = CVNegPlus1.logBase2();
19503 return Sub(N0, Shl(N0, ShiftAmt));
19504 } else if (CVNegMinus1.isPowerOf2()) {
19505 ShiftAmt = CVNegMinus1.logBase2();
19506 return Negate(Add(Shl(N0, ShiftAmt), N0));
19507 } else if (SCVPlus1.isPowerOf2()) {
19508 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19509 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19510 }
19511 }
19512
19513 return SDValue();
19514}
19515
19517 SelectionDAG &DAG) {
19518 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19519 // optimize away operation when it's from a constant.
19520 //
19521 // The general transformation is:
19522 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19523 // AND(VECTOR_CMP(x,y), constant2)
19524 // constant2 = UNARYOP(constant)
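// Illustrative example (types assumed for the example):
//   (v4f32 (sint_to_fp (and (v4i32 (setcc ...)), (splat 1))))
// has lanes that are either 0 or 1 going into the conversion, so the constant
// can be converted at compile time instead:
//   (v4f32 (bitcast (and (v4i32 (setcc ...)), (splat 0x3f800000))))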
19525
19526 // Early exit if this isn't a vector operation, the operand of the
19527 // unary operation isn't a bitwise AND, or if the sizes of the operations
19528 // aren't the same.
19529 EVT VT = N->getValueType(0);
19530 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19531 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19532 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19533 return SDValue();
19534
19535 // Now check that the other operand of the AND is a constant. We could
19536 // make the transformation for non-constant splats as well, but it's unclear
19537 // that would be a benefit as it would not eliminate any operations, just
19538 // perform one more step in scalar code before moving to the vector unit.
19539 if (BuildVectorSDNode *BV =
19540 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19541 // Bail out if the vector isn't a constant.
19542 if (!BV->isConstant())
19543 return SDValue();
19544
19545 // Everything checks out. Build up the new and improved node.
19546 SDLoc DL(N);
19547 EVT IntVT = BV->getValueType(0);
19548 // Create a new constant of the appropriate type for the transformed
19549 // DAG.
19550 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19551 // The AND node needs bitcasts to/from an integer vector type around it.
19552 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19553 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19554 N->getOperand(0)->getOperand(0), MaskConst);
19555 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19556 return Res;
19557 }
19558
19559 return SDValue();
19560}
19561
19562/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19563 /// functions; this can help to reduce the number of fmovs to/from GPRs.
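/// Illustrative example (types assumed): in a streaming function,
///   (i64 (fp_to_sint f64 %x))
/// is rewritten as: insert %x into lane 0 of an nxv2f64 vector, convert that
/// vector to nxv2i64, then extract lane 0 as the scalar result.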
19564static SDValue
19567 const AArch64Subtarget *Subtarget) {
19568 if (N->isStrictFPOpcode())
19569 return SDValue();
19570
19571 if (DCI.isBeforeLegalizeOps())
19572 return SDValue();
19573
19574 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19575 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19576 return SDValue();
19577
19578 auto isSupportedType = [](EVT VT) {
19579 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19580 };
19581
19582 SDValue SrcVal = N->getOperand(0);
19583 EVT SrcTy = SrcVal.getValueType();
19584 EVT DestTy = N->getValueType(0);
19585
19586 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19587 return SDValue();
19588
19589 EVT SrcVecTy;
19590 EVT DestVecTy;
19591 if (DestTy.bitsGT(SrcTy)) {
19592 DestVecTy = getPackedSVEVectorVT(DestTy);
19593 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19594 } else {
19595 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19596 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19597 }
19598
19599 // Ensure the resulting src/dest vector type is legal.
19600 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19601 return SDValue();
19602
19603 SDLoc DL(N);
19604 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19605 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19606 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19607 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19608 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19609}
19610
19613 const AArch64Subtarget *Subtarget) {
19614 // First try to optimize away the conversion when it's conditionally from
19615 // a constant. Vectors only.
19617 return Res;
19618
19619 if (SDValue Res =
19620 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19621 return Res;
19622
19623 EVT VT = N->getValueType(0);
19624 if (VT != MVT::f32 && VT != MVT::f64)
19625 return SDValue();
19626
19627 // Only optimize when the source and destination types have the same width.
19628 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19629 return SDValue();
19630
19631 // If the result of an integer load is only used by an integer-to-float
19632 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
19633 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19634 SDValue N0 = N->getOperand(0);
19635 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19636 N0.hasOneUse() &&
19637 // Do not change the width of a volatile load.
19638 !cast<LoadSDNode>(N0)->isVolatile()) {
19639 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19640 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19641 LN0->getPointerInfo(), LN0->getAlign(),
19642 LN0->getMemOperand()->getFlags());
19643
19644 // Make sure successors of the original load stay after it by updating them
19645 // to use the new Chain.
19646 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19647
19648 unsigned Opcode =
19649 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19650 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19651 }
19652
19653 return SDValue();
19654}
19655
19656/// Fold a floating-point multiply by power of two into floating-point to
19657/// fixed-point conversion.
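/// Illustrative example (operands assumed): for
///   (v4i32 (fp_to_sint (fmul v4f32 %x, (splat 16.0))))
/// the scale of 2^4 is folded into the conversion, giving a fixed-point
/// convert with 4 fractional bits, i.e. fcvtzs v0.4s, v0.4s, #4.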
19660 const AArch64Subtarget *Subtarget) {
19661 if (SDValue Res =
19662 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19663 return Res;
19664
19665 if (!Subtarget->isNeonAvailable())
19666 return SDValue();
19667
19668 if (!N->getValueType(0).isSimple())
19669 return SDValue();
19670
19671 SDValue Op = N->getOperand(0);
19672 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19673 return SDValue();
19674
19675 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19676 return SDValue();
19677
19678 SDValue ConstVec = Op->getOperand(1);
19679 if (!isa<BuildVectorSDNode>(ConstVec))
19680 return SDValue();
19681
19682 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19683 uint32_t FloatBits = FloatTy.getSizeInBits();
19684 if (FloatBits != 32 && FloatBits != 64 &&
19685 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19686 return SDValue();
19687
19688 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19689 uint32_t IntBits = IntTy.getSizeInBits();
19690 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19691 return SDValue();
19692
19693 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19694 if (IntBits > FloatBits)
19695 return SDValue();
19696
19697 BitVector UndefElements;
19698 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19699 int32_t Bits = IntBits == 64 ? 64 : 32;
19700 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19701 if (C == -1 || C == 0 || C > Bits)
19702 return SDValue();
19703
19704 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19705 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19706 return SDValue();
19707
19708 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19709 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19710 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19711 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19712 return SDValue();
19713 }
19714
19715 SDLoc DL(N);
19716 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19717 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19718 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19719 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19720 SDValue FixConv =
19722 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19723 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19724 // We can handle smaller integers by generating an extra trunc.
19725 if (IntBits < FloatBits)
19726 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19727
19728 return FixConv;
19729}
19730
19731// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19732// convert to csel(ccmp(.., cc0)), depending on cc1:
19733
19734// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19735// =>
19736// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19737//
19738// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19739// =>
19740// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
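// Illustrative example (operands assumed): for
//   (and (cset eq, (cmp x0, x1)), (cset lt, (cmp x2, x3)))
// the second compare only matters when the first condition holds, so this can
// be selected as
//   cmp  x0, x1
//   ccmp x2, x3, #0, eq   // if eq: compare x2, x3; else: NZCV = 0, so lt fails
//   cset w0, lt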
19742 EVT VT = N->getValueType(0);
19743 SDValue CSel0 = N->getOperand(0);
19744 SDValue CSel1 = N->getOperand(1);
19745
19746 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19747 CSel1.getOpcode() != AArch64ISD::CSEL)
19748 return SDValue();
19749
19750 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19751 return SDValue();
19752
19753 if (!isNullConstant(CSel0.getOperand(0)) ||
19754 !isOneConstant(CSel0.getOperand(1)) ||
19755 !isNullConstant(CSel1.getOperand(0)) ||
19756 !isOneConstant(CSel1.getOperand(1)))
19757 return SDValue();
19758
19759 SDValue Cmp0 = CSel0.getOperand(3);
19760 SDValue Cmp1 = CSel1.getOperand(3);
19763 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19764 return SDValue();
19765 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19766 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19767 std::swap(Cmp0, Cmp1);
19768 std::swap(CC0, CC1);
19769 }
19770
19771 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19772 return SDValue();
19773
19774 SDLoc DL(N);
19775 SDValue CCmp, Condition;
19776 unsigned NZCV;
19777
19778 if (N->getOpcode() == ISD::AND) {
19780 Condition = getCondCode(DAG, InvCC0);
19782 } else {
19784 Condition = getCondCode(DAG, CC0);
19786 }
19787
19788 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19789
19790 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19791 if (Op1 && Op1->getAPIntValue().isNegative() &&
19792 Op1->getAPIntValue().sgt(-32)) {
19793 // CCMP accepts a constant in the range [0, 31];
19794 // if Op1 is a constant in the range [-31, -1], we
19795 // can select CCMN instead to avoid the extra mov.
19796 SDValue AbsOp1 =
19797 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19798 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
19799 AbsOp1, NZCVOp, Condition, Cmp0);
19800 } else {
19801 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
19802 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19803 }
19804 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19805 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
19806}
19807
19809 const AArch64Subtarget *Subtarget,
19810 const AArch64TargetLowering &TLI) {
19811 SelectionDAG &DAG = DCI.DAG;
19812
19813 if (SDValue R = performANDORCSELCombine(N, DAG))
19814 return R;
19815
19816 return SDValue();
19817}
19818
19820 if (!MemVT.getVectorElementType().isSimple())
19821 return false;
19822
19823 uint64_t MaskForTy = 0ull;
19824 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19825 case MVT::i8:
19826 MaskForTy = 0xffull;
19827 break;
19828 case MVT::i16:
19829 MaskForTy = 0xffffull;
19830 break;
19831 case MVT::i32:
19832 MaskForTy = 0xffffffffull;
19833 break;
19834 default:
19835 return false;
19836 break;
19837 }
19838
19839 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19840 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
19841 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19842
19843 return false;
19844}
19845
19847 SDValue LeafOp = SDValue(N, 0);
19848 SDValue Op = N->getOperand(0);
19849 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19850 LeafOp.getValueType() != Op.getValueType())
19851 Op = Op->getOperand(0);
19852 if (LeafOp.getValueType() == Op.getValueType())
19853 return Op;
19854 return SDValue();
19855}
19856
19859 SelectionDAG &DAG = DCI.DAG;
19860 SDValue Src = N->getOperand(0);
19861 unsigned Opc = Src->getOpcode();
19862
19863 // Zero/any extend of an unsigned unpack
19864 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19865 SDValue UnpkOp = Src->getOperand(0);
19866 SDValue Dup = N->getOperand(1);
19867
19868 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19869 return SDValue();
19870
19871 SDLoc DL(N);
19872 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
19873 if (!C)
19874 return SDValue();
19875
19876 uint64_t ExtVal = C->getZExtValue();
19877
19878 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19879 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19880 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19881 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19882 };
19883
19884 // If the mask is fully covered by the unpack, we don't need to push
19885 // a new AND onto the operand
19886 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
19887 if (MaskAndTypeMatch(EltTy))
19888 return Src;
19889
19890 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19891 // to see if the mask is all-ones of size MemTy.
19892 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
19893 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19894 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19895 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19896 if (MaskAndTypeMatch(EltTy))
19897 return Src;
19898 }
19899
19900 // Truncate to prevent a DUP with an over-wide constant
19901 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
19902
19903 // Otherwise, make sure we propagate the AND to the operand
19904 // of the unpack
19905 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
19906 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
19907
19908 SDValue And = DAG.getNode(ISD::AND, DL,
19909 UnpkOp->getValueType(0), UnpkOp, Dup);
19910
19911 return DAG.getNode(Opc, DL, N->getValueType(0), And);
19912 }
19913
19914 if (DCI.isBeforeLegalizeOps())
19915 return SDValue();
19916
19917 // If one side of the AND is an all-active predicate then the AND is
19918 // redundant and we can return the other operand directly.
19919 if (isAllActivePredicate(DAG, N->getOperand(0)))
19920 return N->getOperand(1);
19921 if (isAllActivePredicate(DAG, N->getOperand(1)))
19922 return N->getOperand(0);
19923
19925 return SDValue();
19926
19927 SDValue Mask = N->getOperand(1);
19928
19929 if (!Src.hasOneUse())
19930 return SDValue();
19931
19932 EVT MemVT;
19933
19934 // SVE load instructions perform an implicit zero-extend, which makes them
19935 // perfect candidates for combining.
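// Illustrative example (types assumed): for an SVE LD1 that zero-extends i8
// elements into 16-bit lanes (MemVT nxv8i8), an AND with a splat of 0x00ff is
// redundant because the load already clears the upper bits of each lane, so
// the load itself is returned.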
19936 switch (Opc) {
19937 case AArch64ISD::LD1_MERGE_ZERO:
19938 case AArch64ISD::LDNF1_MERGE_ZERO:
19939 case AArch64ISD::LDFF1_MERGE_ZERO:
19940 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
19941 break;
19942 case AArch64ISD::GLD1_MERGE_ZERO:
19943 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19944 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19945 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19946 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19947 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19948 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19949 case AArch64ISD::GLDFF1_MERGE_ZERO:
19950 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19951 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19952 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19953 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19954 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19955 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19956 case AArch64ISD::GLDNT1_MERGE_ZERO:
19957 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
19958 break;
19959 default:
19960 return SDValue();
19961 }
19962
19963 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
19964 return Src;
19965
19966 return SDValue();
19967}
19968
19969// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19972
19973 // This function performs an optimization on a specific pattern involving
19974 // an AND operation and a SETCC (Set Condition Code) node.
19975
19976 SDValue SetCC = N->getOperand(0);
19977 EVT VT = N->getValueType(0);
19978 SelectionDAG &DAG = DCI.DAG;
19979
19980 // If the current node (N) is used by any SELECT instruction, return an
19981 // empty SDValue so the optimization is not applied, as it could otherwise
19982 // produce incorrect results.
19983 for (auto U : N->users())
19984 if (U->getOpcode() == ISD::SELECT)
19985 return SDValue();
19986
19987 // Check if the operand is a SETCC node with floating-point comparison
19988 if (SetCC.getOpcode() == ISD::SETCC &&
19989 SetCC.getOperand(0).getValueType() == MVT::f32) {
19990
19991 SDValue Cmp;
19993
19994 // Check if the DAG is after legalization and if we can emit the conjunction
19995 if (!DCI.isBeforeLegalize() &&
19996 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19997
19999
20000 SDLoc DL(N);
20001 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20002 DAG.getConstant(0, DL, VT),
20003 getCondCode(DAG, InvertedCC), Cmp);
20004 }
20005 }
20006 return SDValue();
20007}
20008
20011 SelectionDAG &DAG = DCI.DAG;
20012 SDValue LHS = N->getOperand(0);
20013 SDValue RHS = N->getOperand(1);
20014 EVT VT = N->getValueType(0);
20015
20016 if (SDValue R = performANDORCSELCombine(N, DAG))
20017 return R;
20018
20019 if (SDValue R = performANDSETCCCombine(N,DCI))
20020 return R;
20021
20022 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20023 return SDValue();
20024
20025 if (VT.isScalableVector())
20026 return performSVEAndCombine(N, DCI);
20027
20028 // The combining code below works only for NEON vectors. In particular, it
20029 // does not work for SVE when dealing with vectors wider than 128 bits.
20030 if (!VT.is64BitVector() && !VT.is128BitVector())
20031 return SDValue();
20032
20033 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
20034 if (!BVN)
20035 return SDValue();
20036
20037 // AND does not accept an immediate, so check if we can use a BIC immediate
20038 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20039 // pattern in isel, because some immediates may be lowered to the preferred
20040 // (and x, (movi imm)) form, even though an mvni representation also exists.
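// Illustrative example (operands assumed): for
//   (and v4i32 %x, (splat 0xffff00ff))
// the cleared bits 0x0000ff00 fit BIC's 8-bit shifted immediate, so this can
// be selected as
//   bic v0.4s, #0xff, lsl #8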
20041 APInt DefBits(VT.getSizeInBits(), 0);
20042 APInt UndefBits(VT.getSizeInBits(), 0);
20043 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20044 SDValue NewOp;
20045
20046 // Any bits known to already be 0 need not be cleared again, which can help
20047 // reduce the size of the immediate to one supported by the instruction.
20048 KnownBits Known = DAG.computeKnownBits(LHS);
20049 APInt ZeroSplat(VT.getSizeInBits(), 0);
20050 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20051 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20052 << (Known.Zero.getBitWidth() * I);
20053
20054 DefBits = ~(DefBits | ZeroSplat);
20055 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20056 DefBits, &LHS)) ||
20057 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20058 DefBits, &LHS)))
20059 return NewOp;
20060
20061 UndefBits = ~(UndefBits | ZeroSplat);
20062 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20063 UndefBits, &LHS)) ||
20064 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20065 UndefBits, &LHS)))
20066 return NewOp;
20067 }
20068
20069 return SDValue();
20070}
20071
20074 SelectionDAG &DAG = DCI.DAG;
20075 SDValue LHS = N->getOperand(0);
20076 SDValue RHS = N->getOperand(1);
20077 EVT VT = N->getValueType(0);
20078 SDLoc DL(N);
20079
20080 if (!N->getFlags().hasAllowReassociation())
20081 return SDValue();
20082
20083 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20084 auto ReassocComplex = [&](SDValue A, SDValue B) {
20085 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20086 return SDValue();
20087 unsigned Opc = A.getConstantOperandVal(0);
20088 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20089 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20090 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20091 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20092 return SDValue();
20093 SDValue VCMLA = DAG.getNode(
20094 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20095 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20096 A.getOperand(2), A.getOperand(3));
20097 VCMLA->setFlags(A->getFlags());
20098 return VCMLA;
20099 };
20100 if (SDValue R = ReassocComplex(LHS, RHS))
20101 return R;
20102 if (SDValue R = ReassocComplex(RHS, LHS))
20103 return R;
20104
20105 return SDValue();
20106}
20107
20108static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20109 switch (Opcode) {
20110 case ISD::STRICT_FADD:
20111 case ISD::FADD:
20112 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20113 case ISD::ADD:
20114 return VT == MVT::i64;
20115 default:
20116 return false;
20117 }
20118}
20119
20120static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20122
20124 if ((N.getOpcode() == ISD::SETCC) ||
20125 // get_active_lane_mask is lowered to a whilelo instruction.
20126 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20127 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20128 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20129 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20130 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20131 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20132 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20133 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20134 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20135 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt)))
20136 return true;
20137
20138 return false;
20139}
20140
20141// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20142// ... into: "ptrue p, all" + PTEST
20143static SDValue
20146 const AArch64Subtarget *Subtarget) {
20147 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20148 // Make sure PTEST can be legalised with illegal types.
20149 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20150 return SDValue();
20151
20152 SDValue N0 = N->getOperand(0);
20153 EVT VT = N0.getValueType();
20154
20155 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20156 !isNullConstant(N->getOperand(1)))
20157 return SDValue();
20158
20159 // Restrict the DAG combine to only cases where we're extracting from a
20160 // flag-setting operation.
20161 if (!isPredicateCCSettingOp(N0))
20162 return SDValue();
20163
20164 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20165 SelectionDAG &DAG = DCI.DAG;
20166 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20167 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20168}
20169
20170// Materialize : Idx = (add (mul vscale, NumEls), -1)
20171// i1 = extract_vector_elt t37, Constant:i64<Idx>
20172// ... into: "ptrue p, all" + PTEST
20173static SDValue
20176 const AArch64Subtarget *Subtarget) {
20177 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20178 // Make sure PTEST can be legalised with illegal types.
20179 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20180 return SDValue();
20181
20182 SDValue N0 = N->getOperand(0);
20183 EVT OpVT = N0.getValueType();
20184
20185 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20186 return SDValue();
20187
20188 // Idx == (add (mul vscale, NumEls), -1)
20189 SDValue Idx = N->getOperand(1);
20190 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20191 return SDValue();
20192
20193 SDValue VS = Idx.getOperand(0);
20194 if (VS.getOpcode() != ISD::VSCALE)
20195 return SDValue();
20196
20197 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20198 if (VS.getConstantOperandVal(0) != NumEls)
20199 return SDValue();
20200
20201 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20202 SelectionDAG &DAG = DCI.DAG;
20203 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20204 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20205}
20206
20207static SDValue
20209 const AArch64Subtarget *Subtarget) {
20210 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20211 SelectionDAG &DAG = DCI.DAG;
20212 SDValue Vec = N->getOperand(0);
20213 SDValue Idx = N->getOperand(1);
20214
20215 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20216 return SDValue();
20217
20218 // Only legal for 8, 16, 32, and 64 bit element types.
20219 EVT EltVT = Vec.getValueType().getVectorElementType();
20220 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20221 MVT::bf16, MVT::f32, MVT::f64}),
20222 EltVT.getSimpleVT().SimpleTy))
20223 return SDValue();
20224
20225 SDValue Mask = Idx.getOperand(0);
20226 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20227 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20228 return SDValue();
20229
20230 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20231 Vec);
20232}
20233
20234static SDValue
20236 const AArch64Subtarget *Subtarget) {
20237 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20238 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20239 return Res;
20240 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20241 return Res;
20242 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20243 return Res;
20244
20245 SelectionDAG &DAG = DCI.DAG;
20246 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20247
20248 EVT VT = N->getValueType(0);
20249 const bool FullFP16 = Subtarget->hasFullFP16();
20250 bool IsStrict = N0->isStrictFPOpcode();
20251
20252 // extract(dup x) -> x
20253 if (N0.getOpcode() == AArch64ISD::DUP)
20254 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20255 : N0.getOperand(0);
20256
20257 // Rewrite for pairwise fadd pattern
20258 // (f32 (extract_vector_elt
20259 // (fadd (vXf32 Other)
20260 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20261 // ->
20262 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20263 // (extract_vector_elt (vXf32 Other) 1))
20264 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20265 // we can only do this when it's used only by the extract_vector_elt.
20266 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20267 (!IsStrict || N0.hasOneUse())) {
20268 SDLoc DL(N0);
20269 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20270 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20271
20272 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
20273 SDValue Other = N00;
20274
20275 // And handle the commutative case.
20276 if (!Shuffle) {
20277 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20278 Other = N01;
20279 }
20280
20281 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20282 Other == Shuffle->getOperand(0)) {
20283 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20284 DAG.getConstant(0, DL, MVT::i64));
20285 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20286 DAG.getConstant(1, DL, MVT::i64));
20287 if (!IsStrict)
20288 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20289
20290 // For strict_fadd we need uses of the final extract_vector to be replaced
20291 // with the strict_fadd, but we also need uses of the chain output of the
20292 // original strict_fadd to use the chain output of the new strict_fadd as
20293 // otherwise it may not be deleted.
20294 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20295 {VT, MVT::Other},
20296 {N0->getOperand(0), Extract1, Extract2});
20297 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20298 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20299 return SDValue(N, 0);
20300 }
20301 }
20302
20303 return SDValue();
20304}
20305
20308 SelectionDAG &DAG) {
20309 SDLoc DL(N);
20310 EVT VT = N->getValueType(0);
20311 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20312 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20313
20314 if (VT.isScalableVector())
20315 return SDValue();
20316
20317 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20318 N1Opc == ISD::TRUNCATE) {
20319 SDValue N00 = N0->getOperand(0);
20320 SDValue N10 = N1->getOperand(0);
20321 EVT N00VT = N00.getValueType();
20322 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20323
20324 // Optimize concat_vectors of truncated vectors, where the intermediate
20325 // type is illegal, to avoid said illegality, e.g.,
20326 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20327 // (v2i16 (truncate (v2i64)))))
20328 // ->
20329 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20330 // (v4i32 (bitcast (v2i64))),
20331 // <0, 2, 4, 6>)))
20332 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20333 // on both input and result type, so we might generate worse code.
20334 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20335 if (N00VT == N10.getValueType() &&
20336 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20337 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20338 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20340 for (size_t i = 0; i < Mask.size(); ++i)
20341 Mask[i] = i * 2;
20342 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20343 DAG.getVectorShuffle(
20344 MidVT, DL,
20345 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20346 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20347 }
20348
20349 // Optimize two large shifts and a combine into a single combine and shift
20350 // For AArch64 architectures, sequences like the following:
20351 //
20352 // ushr v0.4s, v0.4s, #20
20353 // ushr v1.4s, v1.4s, #20
20354 // uzp1 v0.8h, v0.8h, v1.8h
20355 //
20356 // Can be optimized to:
20357 //
20358 // uzp2 v0.8h, v0.8h, v1.8h
20359 // ushr v0.8h, v0.8h, #4
20360 //
20361 // This optimization reduces instruction count.
20362 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20363 N00->getOperand(1) == N10->getOperand(1)) {
20364 SDValue N000 = N00->getOperand(0);
20365 SDValue N100 = N10->getOperand(0);
20366 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20367 N101ConstVal = N10->getConstantOperandVal(1),
20368 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20369
20370 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20371 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20372 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20373 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20374 SDValue NewShiftConstant =
20375 DAG.getConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20376
20377 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20378 }
20379 }
20380 }
20381
20382 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20383 N->getOperand(0).getValueType() == MVT::v2i16 ||
20384 N->getOperand(0).getValueType() == MVT::v2i8) {
20385 EVT SrcVT = N->getOperand(0).getValueType();
20386 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20387 // loads to prevent having to go through the v4i8 load legalization that
20388 // needs to extend each element into a larger type.
20389 if (N->getNumOperands() % 2 == 0 &&
20390 all_of(N->op_values(), [SrcVT](SDValue V) {
20391 if (V.getValueType() != SrcVT)
20392 return false;
20393 if (V.isUndef())
20394 return true;
20395 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20396 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20397 LD->getExtensionType() == ISD::NON_EXTLOAD;
20398 })) {
20399 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20400 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20402
20403 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20404 SDValue V = N->getOperand(i);
20405 if (V.isUndef())
20406 Ops.push_back(DAG.getUNDEF(FVT));
20407 else {
20408 LoadSDNode *LD = cast<LoadSDNode>(V);
20409 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20410 LD->getBasePtr(), LD->getMemOperand());
20411 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20412 Ops.push_back(NewLoad);
20413 }
20414 }
20415 return DAG.getBitcast(N->getValueType(0),
20416 DAG.getBuildVector(NVT, DL, Ops));
20417 }
20418 }
20419
20420 // Canonicalise concat_vectors to replace concatenations of truncated nots
20421 // with nots of concatenated truncates. This in some cases allows for multiple
20422 // redundant negations to be eliminated.
20423 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20424 // (v4i16 (truncate (not (v4i32)))))
20425 // ->
20426 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20427 // (v4i16 (truncate (v4i32)))))
20428 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20429 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20430 N->isOnlyUserOf(N1.getNode())) {
20431 auto isBitwiseVectorNegate = [](SDValue V) {
20432 return V->getOpcode() == ISD::XOR &&
20433 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20434 };
20435 SDValue N00 = N0->getOperand(0);
20436 SDValue N10 = N1->getOperand(0);
20437 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20438 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20439 return DAG.getNOT(
20440 DL,
20443 N00->getOperand(0)),
20445 N10->getOperand(0))),
20446 VT);
20447 }
20448 }
20449
20450 // Wait till after everything is legalized to try this. That way we have
20451 // legal vector types and such.
20452 if (DCI.isBeforeLegalizeOps())
20453 return SDValue();
20454
20455 // Optimise concat_vectors of two identical binops with a 128-bit destination
20456 // size, combining into a binop of two concats of the source vectors, e.g.:
20457 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20458 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20459 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20460 isVectorizedBinOp(N0Opc)) &&
20461 N0->hasOneUse() && N1->hasOneUse()) {
20462 SDValue N00 = N0->getOperand(0);
20463 SDValue N01 = N0->getOperand(1);
20464 SDValue N10 = N1->getOperand(0);
20465 SDValue N11 = N1->getOperand(1);
20466
20467 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20468 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20469 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20470 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20471 }
20472 }
20473
20474 auto IsRSHRN = [](SDValue Shr) {
20475 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20476 return false;
20477 SDValue Op = Shr.getOperand(0);
20478 EVT VT = Op.getValueType();
20479 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20480 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20481 return false;
20482
20483 APInt Imm;
20484 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20485 Imm = APInt(VT.getScalarSizeInBits(),
20486 Op.getOperand(1).getConstantOperandVal(0)
20487 << Op.getOperand(1).getConstantOperandVal(1));
20488 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20489 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20490 Imm = APInt(VT.getScalarSizeInBits(),
20491 Op.getOperand(1).getConstantOperandVal(0));
20492 else
20493 return false;
20494
20495 if (Imm != 1ULL << (ShtAmt - 1))
20496 return false;
20497 return true;
20498 };
20499
20500 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20501 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20502 ((IsRSHRN(N1) &&
20504 N1.isUndef())) {
20505 SDValue X = N0.getOperand(0).getOperand(0);
20506 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20507 : N1.getOperand(0).getOperand(0);
20508 EVT BVT =
20509 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20510 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20511 SDValue Add = DAG.getNode(
20512 ISD::ADD, DL, BVT, CC,
20513 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20514 SDValue Shr =
20515 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20516 return Shr;
20517 }
20518
20519 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20520 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20521 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20522 N0.getOperand(1) == N1.getOperand(1)) {
20523 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20524 DAG.getUNDEF(N0.getValueType()));
20525 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20526 DAG.getUNDEF(N0.getValueType()));
20527 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20528 }
20529
20530 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20531 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20532 // canonicalise to that.
20533 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20534 assert(VT.getScalarSizeInBits() == 64);
20535 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20536 DAG.getConstant(0, DL, MVT::i64));
20537 }
20538
20539 // Canonicalise concat_vectors so that the right-hand vector has as few
20540 // bit-casts as possible before its real operation. The primary matching
20541 // destination for these operations will be the narrowing "2" instructions,
20542 // which depend on the operation being performed on this right-hand vector.
20543 // For example,
20544 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20545 // becomes
20546 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20547
20548 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20549 return SDValue();
20550 SDValue RHS = N1->getOperand(0);
20551 MVT RHSTy = RHS.getValueType().getSimpleVT();
20552 // If the RHS is not a vector, this is not the pattern we're looking for.
20553 if (!RHSTy.isVector())
20554 return SDValue();
20555
20556 LLVM_DEBUG(
20557 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20558
20559 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20560 RHSTy.getVectorNumElements() * 2);
20561 return DAG.getNode(ISD::BITCAST, DL, VT,
20562 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
20563 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
20564 RHS));
20565}
20566
20567static SDValue
20569 SelectionDAG &DAG) {
20570 if (DCI.isBeforeLegalizeOps())
20571 return SDValue();
20572
20573 EVT VT = N->getValueType(0);
20574 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20575 return SDValue();
20576
20577 SDValue V = N->getOperand(0);
20578
20579 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20580 // blocks this combine because the non-const case requires custom lowering.
20581 //
20582 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20583 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20584 if (isa<ConstantSDNode>(V.getOperand(0)))
20585 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20586
20587 return SDValue();
20588}
20589
20590static SDValue
20592 SelectionDAG &DAG) {
20593 SDLoc DL(N);
20594 SDValue Vec = N->getOperand(0);
20595 SDValue SubVec = N->getOperand(1);
20596 uint64_t IdxVal = N->getConstantOperandVal(2);
20597 EVT VecVT = Vec.getValueType();
20598 EVT SubVT = SubVec.getValueType();
20599
20600 // Promote fixed length vector zeros.
20601 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20602 Vec.isUndef() && isZerosVector(SubVec.getNode()))
20603 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
20604 : DAG.getConstantFP(0, DL, VecVT);
20605
20606 // Only do this for legal fixed vector types.
20607 if (!VecVT.isFixedLengthVector() ||
20608 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20609 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20610 return SDValue();
20611
20612 // Ignore widening patterns.
20613 if (IdxVal == 0 && Vec.isUndef())
20614 return SDValue();
20615
20616 // Subvector must be half the width and an "aligned" insertion.
20617 unsigned NumSubElts = SubVT.getVectorNumElements();
20618 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20619 (IdxVal != 0 && IdxVal != NumSubElts))
20620 return SDValue();
20621
20622 // Fold insert_subvector -> concat_vectors
20623 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20624 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20625 SDValue Lo, Hi;
20626 if (IdxVal == 0) {
20627 Lo = SubVec;
20628 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20629 DAG.getVectorIdxConstant(NumSubElts, DL));
20630 } else {
20631 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20632 DAG.getVectorIdxConstant(0, DL));
20633 Hi = SubVec;
20634 }
20635 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20636}
20637
20640 SelectionDAG &DAG) {
20641 // Wait until after everything is legalized to try this. That way we have
20642 // legal vector types and such.
20643 if (DCI.isBeforeLegalizeOps())
20644 return SDValue();
20645 // Transform a scalar conversion of a value from a lane extract into a
20646 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20647 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20648 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20649 //
20650 // The second form interacts better with instruction selection and the
20651 // register allocator to avoid cross-class register copies that aren't
20652 // coalescable due to a lane reference.
20653
20654 // Check the operand and see if it originates from a lane extract.
20655 SDValue Op1 = N->getOperand(1);
20657 return SDValue();
20658
20659 // Yep, no additional predication needed. Perform the transform.
20660 SDValue IID = N->getOperand(0);
20661 SDValue Shift = N->getOperand(2);
20662 SDValue Vec = Op1.getOperand(0);
20663 SDValue Lane = Op1.getOperand(1);
20664 EVT ResTy = N->getValueType(0);
20665 EVT VecResTy;
20666 SDLoc DL(N);
20667
20668 // The vector width should be 128 bits by the time we get here, even
20669 // if it started as 64 bits (the extract_vector handling will have
20670 // done so). Bail if it is not.
20671 if (Vec.getValueSizeInBits() != 128)
20672 return SDValue();
20673
20674 if (Vec.getValueType() == MVT::v4i32)
20675 VecResTy = MVT::v4f32;
20676 else if (Vec.getValueType() == MVT::v2i64)
20677 VecResTy = MVT::v2f64;
20678 else
20679 return SDValue();
20680
20681 SDValue Convert =
20682 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20683 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20684}
20685
20686// AArch64 high-vector "long" operations are formed by performing the non-high
20687// version on an extract_subvector of each operand which gets the high half:
20688//
20689// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20690//
20691// However, there are cases which don't have an extract_high explicitly, but
20692// have another operation that can be made compatible with one for free. For
20693// example:
20694//
20695// (dupv64 scalar) --> (extract_high (dup128 scalar))
20696//
20697// This routine does the actual conversion of such DUPs, once outer routines
20698// have determined that everything else is in order.
20699// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20700// similarly here.
20702 MVT VT = N.getSimpleValueType();
20703 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20704 N.getConstantOperandVal(1) == 0)
20705 N = N.getOperand(0);
20706
20707 switch (N.getOpcode()) {
20708 case AArch64ISD::DUP:
20709 case AArch64ISD::DUPLANE8:
20710 case AArch64ISD::DUPLANE16:
20711 case AArch64ISD::DUPLANE32:
20712 case AArch64ISD::DUPLANE64:
20713 case AArch64ISD::MOVI:
20714 case AArch64ISD::MOVIshift:
20715 case AArch64ISD::MOVIedit:
20716 case AArch64ISD::MOVImsl:
20717 case AArch64ISD::MVNIshift:
20718 case AArch64ISD::MVNImsl:
20719 break;
20720 default:
20721 // FMOV could be supported, but isn't very useful, as it would only occur
20722 // if you passed a bitcast' floating point immediate to an eligible long
20723 // integer op (addl, smull, ...).
20724 return SDValue();
20725 }
20726
20727 if (!VT.is64BitVector())
20728 return SDValue();
20729
20730 SDLoc DL(N);
20731 unsigned NumElems = VT.getVectorNumElements();
20732 if (N.getValueType().is64BitVector()) {
20733 MVT ElementTy = VT.getVectorElementType();
20734 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20735 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20736 }
20737
20738 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20739 DAG.getConstant(NumElems, DL, MVT::i64));
20740}
20741
20743 if (N.getOpcode() == ISD::BITCAST)
20744 N = N.getOperand(0);
20745 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20746 return false;
20747 if (N.getOperand(0).getValueType().isScalableVector())
20748 return false;
20749 return N.getConstantOperandAPInt(1) ==
20750 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20751}
20752
20753/// Helper structure to keep track of ISD::SET_CC operands.
20758};
20759
20760/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20762 const SDValue *Cmp;
20764};
20765
20766/// Helper structure to keep track of SetCC information.
20770};
20771
20772 /// Helper structure to be able to read SetCC information. If the
20773 /// IsAArch64 field is set to true, Info is an AArch64SetCCInfo, otherwise
20774 /// Info is a GenericSetCCInfo.
20778};
20779
20780/// Check whether or not \p Op is a SET_CC operation, either a generic or
20781 /// an AArch64 lowered one.
20783/// \p SetCCInfo is filled accordingly.
20784 /// \post SetCCInfo is meaningful only when this function returns true.
20785/// \return True when Op is a kind of SET_CC operation.
20786 // If this is a setcc, this is straightforward.
20788 if (Op.getOpcode() == ISD::SETCC) {
20789 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20790 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20791 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20792 SetCCInfo.IsAArch64 = false;
20793 return true;
20794 }
20795 // Otherwise, check if this is a matching csel instruction.
20796 // In other words:
20797 // - csel 1, 0, cc
20798 // - csel 0, 1, !cc
20799 if (Op.getOpcode() != AArch64ISD::CSEL)
20800 return false;
20801 // Set the information about the operands.
20802 // TODO: we want the operands of the Cmp not the csel
20803 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20804 SetCCInfo.IsAArch64 = true;
20805 SetCCInfo.Info.AArch64.CC =
20806 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20807
20808 // Check that the operands match the constraints:
20809 // (1) Both operands must be constants.
20810 // (2) One must be 1 and the other must be 0.
20811 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20812 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20813
20814 // Check (1).
20815 if (!TValue || !FValue)
20816 return false;
20817
20818 // Check (2).
20819 if (!TValue->isOne()) {
20820 // Update the comparison when we are interested in !cc.
20821 std::swap(TValue, FValue);
20822 SetCCInfo.Info.AArch64.CC =
20824 }
20825 return TValue->isOne() && FValue->isZero();
20826}
20827
20828// Returns true if Op is setcc or zext of setcc.
20829static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20830 if (isSetCC(Op, Info))
20831 return true;
20832 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
20833 isSetCC(Op->getOperand(0), Info));
20834}
20835
20836// The folding we want to perform is:
20837// (add x, [zext] (setcc cc ...) )
20838// -->
20839// (csel x, (add x, 1), !cc ...)
20840//
20841// The latter will get matched to a CSINC instruction.
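// Illustrative example (operands assumed): for
//   (add w0, (zext (setcc lt, w1, w2)))
// the result is w0 when the comparison is false and w0 + 1 when it is true,
// which selects to
//   cmp   w1, w2
//   csinc w0, w0, w0, ge   // ge: keep w0; otherwise: w0 + 1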
20843 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
20844 SDValue LHS = Op->getOperand(0);
20845 SDValue RHS = Op->getOperand(1);
20846 SetCCInfoAndKind InfoAndKind;
20847
20848 // If both operands are a SET_CC, then we don't want to perform this
20849 // folding and create another csel as this results in more instructions
20850 // (and higher register usage).
20851 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
20852 isSetCCOrZExtSetCC(RHS, InfoAndKind))
20853 return SDValue();
20854
20855 // If neither operand is a SET_CC, give up.
20856 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
20857 std::swap(LHS, RHS);
20858 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
20859 return SDValue();
20860 }
20861
20862 // FIXME: This could be generalized to work for FP comparisons.
20863 EVT CmpVT = InfoAndKind.IsAArch64
20864 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
20865 : InfoAndKind.Info.Generic.Opnd0->getValueType();
20866 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
20867 return SDValue();
20868
20869 SDValue CCVal;
20870 SDValue Cmp;
20871 SDLoc DL(Op);
20872 if (InfoAndKind.IsAArch64) {
20873 CCVal = DAG.getConstant(
20875 MVT::i32);
20876 Cmp = *InfoAndKind.Info.AArch64.Cmp;
20877 } else
20878 Cmp = getAArch64Cmp(
20879 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
20880 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
20881 DL);
20882
20883 EVT VT = Op->getValueType(0);
20884 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
20885 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
20886}
20887
20888// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
20890 EVT VT = N->getValueType(0);
20891 // Only scalar integer and vector types.
20892 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
20893 return SDValue();
20894
20895 SDValue LHS = N->getOperand(0);
20896 SDValue RHS = N->getOperand(1);
20897 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20898 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
20899 return SDValue();
20900
20901 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
20902 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
20903 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
20904 return SDValue();
20905
20906 SDValue Op1 = LHS->getOperand(0);
20907 SDValue Op2 = RHS->getOperand(0);
20908 EVT OpVT1 = Op1.getValueType();
20909 EVT OpVT2 = Op2.getValueType();
20910 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
20911 Op2.getOpcode() != AArch64ISD::UADDV ||
20912 OpVT1.getVectorElementType() != VT)
20913 return SDValue();
20914
20915 SDValue Val1 = Op1.getOperand(0);
20916 SDValue Val2 = Op2.getOperand(0);
20917 EVT ValVT = Val1->getValueType(0);
20918 SDLoc DL(N);
20919 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
20920 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
20921 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
20922 DAG.getConstant(0, DL, MVT::i64));
20923}
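// Illustrative example of the UADDV fold above: rather than reducing each
// vector separately and adding the scalar results (roughly
//   addv s0, v0.4s ; addv s1, v1.4s ; add on the extracted scalars),
// the element-wise sum is reduced once:
//   add  v0.4s, v0.4s, v1.4s
//   addv s0, v0.4s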
20924
20925/// Perform the scalar expression combine in the form of:
20926/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
20927/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
20928 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
20929 EVT VT = N->getValueType(0);
20930 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
20931 return SDValue();
20932
20933 SDValue LHS = N->getOperand(0);
20934 SDValue RHS = N->getOperand(1);
20935
20936 // Handle commutativity.
20937 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20938 LHS.getOpcode() != AArch64ISD::CSNEG) {
20939 std::swap(LHS, RHS);
20940 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20941 LHS.getOpcode() != AArch64ISD::CSNEG) {
20942 return SDValue();
20943 }
20944 }
20945
20946 if (!LHS.hasOneUse())
20947 return SDValue();
20948
20949 AArch64CC::CondCode AArch64CC =
20950 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
20951
20952 // The CSEL should include a constant one operand, and the CSNEG should
20953 // include a one or negative-one operand.
20954 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
20955 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
20956 if (!CTVal || !CFVal)
20957 return SDValue();
20958
20959 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
20960 (CTVal->isOne() || CFVal->isOne())) &&
20961 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
20962 (CTVal->isOne() || CFVal->isAllOnes())))
20963 return SDValue();
20964
20965 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
20966 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
20967 !CFVal->isOne()) {
20968 std::swap(CTVal, CFVal);
20969 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20970 }
20971
20972 SDLoc DL(N);
20973 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
20974 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
20975 !CFVal->isAllOnes()) {
20976 APInt C = -1 * CFVal->getAPIntValue();
20977 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
20978 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
20979 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20980 }
20981
20982 // It might be neutral for larger constants, as the immediate needs to be
20983 // materialized in a register.
20984 APInt ADDC = CTVal->getAPIntValue();
20985 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20986 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
20987 return SDValue();
20988
20989 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
20990 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
20991 "Unexpected constant value");
20992
20993 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
20994 SDValue CCVal = getCondCode(DAG, AArch64CC);
20995 SDValue Cmp = LHS.getOperand(3);
20996
20997 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
20998}
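// Illustrative example of the combine above: "b + (cc ? c : 1)" is rewritten
// as CSINC(b + c, b, cc), i.e. roughly
//   add   w8, w9, #c      // b + c; c must be a legal add immediate
//   csinc w0, w8, w9, cc
// instead of materializing the CSEL result and adding it separately.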
20999
21000// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
21001 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
21002 EVT VT = N->getValueType(0);
21003 if (N->getOpcode() != ISD::ADD)
21004 return SDValue();
21005
21006 SDValue Dot = N->getOperand(0);
21007 SDValue A = N->getOperand(1);
21008 // Handle commutativity
21009 auto isZeroDot = [](SDValue Dot) {
21010 return (Dot.getOpcode() == AArch64ISD::UDOT ||
21011 Dot.getOpcode() == AArch64ISD::SDOT) &&
21013 };
21014 if (!isZeroDot(Dot))
21015 std::swap(Dot, A);
21016 if (!isZeroDot(Dot))
21017 return SDValue();
21018
21019 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
21020 Dot.getOperand(2));
21021}
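// Illustrative example of the fold above: since UDOT/SDOT accumulate into
// their first operand, add(udot(zeroes, x, y), acc) produces the same value as
// udot(acc, x, y), saving the separate vector add.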
21022
21023 static bool isNegatedInteger(SDValue Op) {
21024 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
21025}
21026
21027 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
21028 SDLoc DL(Op);
21029 EVT VT = Op.getValueType();
21030 SDValue Zero = DAG.getConstant(0, DL, VT);
21031 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
21032}
21033
21034// Try to fold
21035//
21036// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
21037//
21038// The folding helps csel to be matched with csneg without generating
21039// redundant neg instruction, which includes negation of the csel expansion
21040// of abs node lowered by lowerABS.
21041 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
21042 if (!isNegatedInteger(SDValue(N, 0)))
21043 return SDValue();
21044
21045 SDValue CSel = N->getOperand(1);
21046 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
21047 return SDValue();
21048
21049 SDValue N0 = CSel.getOperand(0);
21050 SDValue N1 = CSel.getOperand(1);
21051
21052 // If neither of them is a negation, the folding is not worthwhile, as it
21053 // would introduce two additional negations while removing only one.
21054 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
21055 return SDValue();
21056
21057 SDValue N0N = getNegatedInteger(N0, DAG);
21058 SDValue N1N = getNegatedInteger(N1, DAG);
21059
21060 SDLoc DL(N);
21061 EVT VT = CSel.getValueType();
21062 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
21063 CSel.getOperand(3));
21064}
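// Illustrative example of the fold above: (0 - csel((0 - a), b, cc)) becomes
// csel(a, (0 - b), cc). When only one side of the csel was a negation (as in
// the csel expansion produced by lowerABS), the result matches a single csneg
// instead of csel followed by neg.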
21065
21066// The basic add/sub long vector instructions have variants with "2" on the end
21067// which act on the high-half of their inputs. They are normally matched by
21068// patterns like:
21069//
21070// (add (zeroext (extract_high LHS)),
21071// (zeroext (extract_high RHS)))
21072// -> uaddl2 vD, vN, vM
21073//
21074// However, if one of the extracts is something like a duplicate, this
21075// instruction can still be used profitably. This function puts the DAG into a
21076// more appropriate form for those patterns to trigger.
21077 static SDValue performAddSubLongCombine(SDNode *N,
21078 TargetLowering::DAGCombinerInfo &DCI) {
21079 SelectionDAG &DAG = DCI.DAG;
21080 if (DCI.isBeforeLegalizeOps())
21081 return SDValue();
21082
21083 MVT VT = N->getSimpleValueType(0);
21084 if (!VT.is128BitVector()) {
21085 if (N->getOpcode() == ISD::ADD)
21086 return performSetccAddFolding(N, DAG);
21087 return SDValue();
21088 }
21089
21090 // Make sure both branches are extended in the same way.
21091 SDValue LHS = N->getOperand(0);
21092 SDValue RHS = N->getOperand(1);
21093 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
21094 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
21095 LHS.getOpcode() != RHS.getOpcode())
21096 return SDValue();
21097
21098 unsigned ExtType = LHS.getOpcode();
21099
21100 // It's not worth doing if at least one of the inputs isn't already an
21101 // extract, but we don't know which it'll be so we have to try both.
21102 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
21103 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
21104 if (!RHS.getNode())
21105 return SDValue();
21106
21107 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
21108 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
21109 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
21110 if (!LHS.getNode())
21111 return SDValue();
21112
21113 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
21114 }
21115
21116 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
21117}
21118
21119static bool isCMP(SDValue Op) {
21120 return Op.getOpcode() == AArch64ISD::SUBS &&
21121 !Op.getNode()->hasAnyUseOfValue(0);
21122}
21123
21124// (CSEL 1 0 CC Cond) => CC
21125// (CSEL 0 1 CC Cond) => !CC
21126static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
21127 if (Op.getOpcode() != AArch64ISD::CSEL)
21128 return std::nullopt;
21129 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21130 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
21131 return std::nullopt;
21132 SDValue OpLHS = Op.getOperand(0);
21133 SDValue OpRHS = Op.getOperand(1);
21134 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
21135 return CC;
21136 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
21137 return getInvertedCondCode(CC);
21138
21139 return std::nullopt;
21140}
21141
21142// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
21143// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
21144static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
21145 SDValue CmpOp = Op->getOperand(2);
21146 if (!isCMP(CmpOp))
21147 return SDValue();
21148
21149 if (IsAdd) {
21150 if (!isOneConstant(CmpOp.getOperand(1)))
21151 return SDValue();
21152 } else {
21153 if (!isNullConstant(CmpOp.getOperand(0)))
21154 return SDValue();
21155 }
21156
21157 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
21158 auto CC = getCSETCondCode(CsetOp);
21159 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
21160 return SDValue();
21161
21162 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
21163 Op->getOperand(0), Op->getOperand(1),
21164 CsetOp.getOperand(3));
21165}
21166
21167// (ADC x 0 cond) => (CINC x HS cond)
21169 SDValue LHS = N->getOperand(0);
21170 SDValue RHS = N->getOperand(1);
21171 SDValue Cond = N->getOperand(2);
21172
21173 if (!isNullConstant(RHS))
21174 return SDValue();
21175
21176 EVT VT = N->getValueType(0);
21177 SDLoc DL(N);
21178
21179 // (CINC x cc cond) <=> (CSINC x x !cc cond)
21181 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
21182}
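// Illustrative example of the fold above: (ADC x, 0, cond) computes
// x + 0 + carry, i.e. x incremented only when the carry flag is set, so it can
// be selected as "cinc x, hs" (equivalently CSINC x, x, lo) on the same flags.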
21183
21186 SelectionDAG &DAG) {
21187 SDLoc DL(N);
21188 EVT VT = N->getValueType(0);
21189
21191 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
21192 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
21193 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
21194 if (Elt0->getOpcode() == ISD::FP_ROUND &&
21195 Elt1->getOpcode() == ISD::FP_ROUND &&
21196 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21197 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21198 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
21199 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21200 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21201 // Constant index.
21202 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21203 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21204 Elt0->getOperand(0)->getOperand(0) ==
21205 Elt1->getOperand(0)->getOperand(0) &&
21206 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
21207 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
21208 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
21209 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
21210 SDValue HighLanes;
21211 if (Elt2->getOpcode() == ISD::UNDEF &&
21212 Elt3->getOpcode() == ISD::UNDEF) {
21213 HighLanes = DAG.getUNDEF(MVT::v2f32);
21214 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
21215 Elt3->getOpcode() == ISD::FP_ROUND &&
21216 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
21217 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
21218 Elt2->getConstantOperandVal(1) ==
21219 Elt3->getConstantOperandVal(1) &&
21220 Elt2->getOperand(0)->getOpcode() ==
21221 ISD::EXTRACT_VECTOR_ELT &&
21222 Elt3->getOperand(0)->getOpcode() ==
21223 ISD::EXTRACT_VECTOR_ELT &&
21224 // Constant index.
21225 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
21226 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
21227 Elt2->getOperand(0)->getOperand(0) ==
21228 Elt3->getOperand(0)->getOperand(0) &&
21229 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
21230 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
21231 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
21232 HighLanes =
21233 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
21234 }
21235 if (HighLanes) {
21236 SDValue DoubleToSingleSticky =
21237 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
21238 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
21239 DoubleToSingleSticky, HighLanes);
21240 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
21241 Elt0->getOperand(1));
21242 }
21243 }
21244 }
21245 }
21246
21247 if (VT == MVT::v2f64) {
21248 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21249 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
21250 Elt1->getOpcode() == ISD::FP_EXTEND &&
21251 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21252 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21253 Elt0->getOperand(0)->getOperand(0) ==
21254 Elt1->getOperand(0)->getOperand(0) &&
21255 // Constant index.
21256 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21257 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21258 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
21259 Elt1->getOperand(0)->getConstantOperandVal(1) &&
21260 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21261 // ResultType's known minimum vector length.
21262 Elt0->getOperand(0)->getConstantOperandVal(1) %
21264 0) {
21265 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
21266 if (SrcVec.getValueType() == MVT::v4f16 ||
21267 SrcVec.getValueType() == MVT::v4bf16) {
21268 SDValue HalfToSingle =
21269 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
21270 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
21271 SDValue Extract = DAG.getNode(
21273 HalfToSingle, SubvectorIdx);
21274 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
21275 }
21276 }
21277 }
21278
21279 // A build vector of two extracted elements is equivalent to an
21280 // extract subvector where the inner vector is any-extended to the
21281 // extract_vector_elt VT.
21282 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
21283 // (extract_elt_iXX_to_i32 vec Idx+1))
21284 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
21285
21286 // For now, only consider the v2i32 case, which arises as a result of
21287 // legalization.
21288 if (VT != MVT::v2i32)
21289 return SDValue();
21290
21291 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21292 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
21293 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21294 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21295 // Constant index.
21296 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21297 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21298 // Both EXTRACT_VECTOR_ELT from same vector...
21299 Elt0->getOperand(0) == Elt1->getOperand(0) &&
21300 // ... and contiguous. First element's index +1 == second element's index.
21301 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
21302 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21303 // ResultType's known minimum vector length.
21304 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
21305 SDValue VecToExtend = Elt0->getOperand(0);
21306 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
21307 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
21308 return SDValue();
21309
21310 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
21311
21312 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
21313 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
21314 SubvectorIdx);
21315 }
21316
21317 return SDValue();
21318}
21319
21320 // A special combine for the sqdmulh family of instructions.
21321 // smin( sra( mul( sext v0, sext v1 ), SHIFT_AMOUNT ),
21322 // SATURATING_VAL ) can be reduced to sqdmulh(...)
21323 static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
21324
21325 if (N->getOpcode() != ISD::SMIN)
21326 return SDValue();
21327
21328 EVT DestVT = N->getValueType(0);
21329
21330 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
21331 DestVT.isScalableVector())
21332 return SDValue();
21333
21334 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
21335
21336 if (!Clamp)
21337 return SDValue();
21338
21339 MVT ScalarType;
21340 unsigned ShiftAmt = 0;
21341 switch (Clamp->getSExtValue()) {
21342 case (1ULL << 15) - 1:
21343 ScalarType = MVT::i16;
21344 ShiftAmt = 16;
21345 break;
21346 case (1ULL << 31) - 1:
21347 ScalarType = MVT::i32;
21348 ShiftAmt = 32;
21349 break;
21350 default:
21351 return SDValue();
21352 }
21353
21354 SDValue Sra = N->getOperand(0);
21355 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
21356 return SDValue();
21357
21358 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
21359 if (!RightShiftVec)
21360 return SDValue();
21361 unsigned SExtValue = RightShiftVec->getSExtValue();
21362
21363 if (SExtValue != (ShiftAmt - 1))
21364 return SDValue();
21365
21366 SDValue Mul = Sra.getOperand(0);
21367 if (Mul.getOpcode() != ISD::MUL)
21368 return SDValue();
21369
21370 SDValue SExt0 = Mul.getOperand(0);
21371 SDValue SExt1 = Mul.getOperand(1);
21372
21373 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
21374 SExt1.getOpcode() != ISD::SIGN_EXTEND)
21375 return SDValue();
21376
21377 EVT SExt0Type = SExt0.getOperand(0).getValueType();
21378 EVT SExt1Type = SExt1.getOperand(0).getValueType();
21379
21380 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
21381 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
21382 SExt0Type.getVectorNumElements() == 1)
21383 return SDValue();
21384
21385 SDLoc DL(N);
21386 SDValue V0 = SExt0.getOperand(0);
21387 SDValue V1 = SExt1.getOperand(0);
21388
21389 // Ensure input vectors are extended to legal types
21390 if (SExt0Type.getFixedSizeInBits() < 64) {
21391 unsigned VecNumElements = SExt0Type.getVectorNumElements();
21392 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
21393 VecNumElements);
21394 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
21395 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
21396 }
21397
21398 SDValue SQDMULH =
21399 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
21400
21401 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
21402}
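// Illustrative example of the combine above, for v4i16 inputs a and b:
//   smin(sra(mul(sext a, sext b), 15), 32767)
// computes (a*b) >> 15 clamped to the i16 maximum, which only differs from the
// plain shift for a == b == -32768; that is exactly the saturating behaviour of
//   sqdmulh v0.4h, v0.4h, v1.4h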
21403
21405 if (SDValue V = trySQDMULHCombine(N, DAG)) {
21406 return V;
21407 }
21408
21409 return SDValue();
21410}
21411
21414 SDLoc DL(N);
21415 EVT VT = N->getValueType(0);
21416 SDValue N0 = N->getOperand(0);
21417 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
21418 N0.getOpcode() == AArch64ISD::DUP) {
21419 SDValue Op = N0.getOperand(0);
21420 if (VT.getScalarType() == MVT::i32 &&
21421 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
21422 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
21423 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
21424 }
21425
21426 // Performing the following combine produces a preferable form for ISEL.
21427 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
21428 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21429 N0.hasOneUse()) {
21430 SDValue Op = N0.getOperand(0);
21431 SDValue ExtractIndexNode = N0.getOperand(1);
21432 if (!isa<ConstantSDNode>(ExtractIndexNode))
21433 return SDValue();
21434
21435 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
21436 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
21437 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
21438 "Unexpected legalisation result!");
21439
21440 EVT SrcVectorType = Op.getValueType();
21441 // We also assume that SrcVectorType cannot be a V64 (see
21442 // LowerEXTRACT_VECTOR_ELT).
21443 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
21444 "Unexpected legalisation result!");
21445
21446 unsigned ExtractIndex =
21447 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
21448 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
21449
21450 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
21451 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
21452 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
21453 }
21454
21455 return SDValue();
21456}
21457
21458 // Check if a node is an extend or shift operand.
21459 static bool isExtendOrShiftOperand(SDValue N) {
21460 unsigned Opcode = N.getOpcode();
21461 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
21462 EVT SrcVT;
21463 if (Opcode == ISD::SIGN_EXTEND_INREG)
21464 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
21465 else
21466 SrcVT = N.getOperand(0).getValueType();
21467
21468 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
21469 } else if (Opcode == ISD::AND) {
21470 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
21471 if (!CSD)
21472 return false;
21473 uint64_t AndMask = CSD->getZExtValue();
21474 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
21475 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
21476 return isa<ConstantSDNode>(N.getOperand(1));
21477 }
21478
21479 return false;
21480}
21481
21482// (N - Y) + Z --> (Z - Y) + N
21483// when N is an extend or shift operand
21484 static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
21485 SelectionDAG &DAG) {
21486 auto IsOneUseExtend = [](SDValue N) {
21487 return N.hasOneUse() && isExtendOrShiftOperand(N);
21488 };
21489
21490 // DAGCombiner will revert the combination when Z is constant, causing an
21491 // infinite loop. So don't enable the combination when Z is constant.
21492 // If Z is a one-use shift by a constant, we also can't do the optimization,
21493 // as it would fall into the same infinite loop.
21494 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
21495 return SDValue();
21496
21497 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21498 return SDValue();
21499
21500 SDValue Shift = SUB.getOperand(0);
21501 if (!IsOneUseExtend(Shift))
21502 return SDValue();
21503
21504 SDLoc DL(N);
21505 EVT VT = N->getValueType(0);
21506
21507 SDValue Y = SUB.getOperand(1);
21508 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
21509 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
21510}
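// Illustrative example of the reassociation above:
//   ((x << 3) - y) + z  -->  (z - y) + (x << 3)
// so the final ADD can fold the shift into its shifted-register form
// (add res, tmp, x, lsl #3) instead of needing a separate LSL.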
21511
21512 static SDValue performAddCombineForShiftedOperands(SDNode *N,
21513 SelectionDAG &DAG) {
21514 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21515 // commutative.
21516 if (N->getOpcode() != ISD::ADD)
21517 return SDValue();
21518
21519 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21520 // shifted register is only available for i32 and i64.
21521 EVT VT = N->getValueType(0);
21522 if (VT != MVT::i32 && VT != MVT::i64)
21523 return SDValue();
21524
21525 SDLoc DL(N);
21526 SDValue LHS = N->getOperand(0);
21527 SDValue RHS = N->getOperand(1);
21528
21529 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
21530 return Val;
21531 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
21532 return Val;
21533
21534 uint64_t LHSImm = 0, RHSImm = 0;
21535 // If both operands are shifted by an immediate and the shift amount is not
21536 // greater than 4 for one operand, swap LHS and RHS to put the operand with
21537 // the smaller shift amount on the RHS.
21538 //
21539 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21540 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21541 // with LSL (shift > 4). On other processors, this swap is a no-op for both
21542 // performance and correctness.
21543 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
21544 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
21545 RHSImm > 4 && LHS.hasOneUse())
21546 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
21547
21548 return SDValue();
21549}
21550
21551 // The middle end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
21552 // This reassociates it back to allow the creation of more mls instructions.
21553 static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
21554 if (N->getOpcode() != ISD::SUB)
21555 return SDValue();
21556
21557 SDValue Add = N->getOperand(1);
21558 SDValue X = N->getOperand(0);
21559 if (Add.getOpcode() != ISD::ADD)
21560 return SDValue();
21561
21562 if (!Add.hasOneUse())
21563 return SDValue();
21565 return SDValue();
21566
21567 SDValue M1 = Add.getOperand(0);
21568 SDValue M2 = Add.getOperand(1);
21569 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21570 M1.getOpcode() != AArch64ISD::UMULL)
21571 return SDValue();
21572 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21573 M2.getOpcode() != AArch64ISD::UMULL)
21574 return SDValue();
21575
21576 EVT VT = N->getValueType(0);
21577 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21578 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21579}
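// Illustrative example of the reassociation above:
//   x - (a*b + c*d)  -->  (x - a*b) - c*d
// which lets both multiplies be selected as multiply-subtract (mls/msub) into
// the running value instead of mul + add + sub.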
21580
21581// Combine into mla/mls.
21582// This works on the patterns of:
21583// add v1, (mul v2, v3)
21584// sub v1, (mul v2, v3)
21585// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21586// It will transform the add/sub to a scalable version, so that we can
21587// make use of SVE's MLA/MLS that will be generated for that pattern
21588 static SDValue
21589 performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
21590 SelectionDAG &DAG = DCI.DAG;
21591 // Make sure that the types are legal
21592 if (!DCI.isAfterLegalizeDAG())
21593 return SDValue();
21594 // Before using SVE's features, check first if it's available.
21595 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21596 return SDValue();
21597
21598 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21599 return SDValue();
21600
21601 if (!N->getValueType(0).isFixedLengthVector())
21602 return SDValue();
21603
21604 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21605 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21606 return SDValue();
21607
21608 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21609 return SDValue();
21610
21611 SDValue MulValue = Op1->getOperand(0);
21612 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21613 return SDValue();
21614
21615 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21616 return SDValue();
21617
21618 EVT ScalableVT = MulValue.getValueType();
21619 if (!ScalableVT.isScalableVector())
21620 return SDValue();
21621
21622 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21623 SDValue NewValue =
21624 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21625 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21626 };
21627
21628 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21629 return res;
21630 else if (N->getOpcode() == ISD::ADD)
21631 return performOpt(N->getOperand(1), N->getOperand(0));
21632
21633 return SDValue();
21634}
21635
21636 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21637 // help, for example, to produce ssra from sshr+add.
21638 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
21639 EVT VT = N->getValueType(0);
21640 if (VT != MVT::i64 ||
21641 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21642 return SDValue();
21643 SDValue Op0 = N->getOperand(0);
21644 SDValue Op1 = N->getOperand(1);
21645
21646 // At least one of the operands should be an extract, and the other should be
21647 // something that is easy to convert to v1i64 type (in this case a load).
21648 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21649 Op0.getOpcode() != ISD::LOAD)
21650 return SDValue();
21651 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21652 Op1.getOpcode() != ISD::LOAD)
21653 return SDValue();
21654
21655 SDLoc DL(N);
21656 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21657 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21658 Op0 = Op0.getOperand(0);
21659 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21660 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21661 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21662 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21663 Op1 = Op1.getOperand(0);
21664 } else
21665 return SDValue();
21666
21667 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21668 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21669 DAG.getConstant(0, DL, MVT::i64));
21670}
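// Illustrative example of the conversion above: an i64 add of
// (extract_elt (v1i64 (sshr v, #n)), 0) and a loaded value can instead be
// performed as a v1i64 add, so the shift and add combine into a single
// "ssra d0, d1, #n".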
21671
21672 static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
21673 SDValue BV = peekThroughOneUseBitcasts(B);
21674 if (!BV->hasOneUse())
21675 return false;
21676 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21677 if (!Ld || !Ld->isSimple())
21678 return false;
21679 Loads.push_back(Ld);
21680 return true;
21681 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21682 BV.getOpcode() == ISD::CONCAT_VECTORS) {
21683 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21684 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
21685 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
21686 return false;
21687 Loads.push_back(Ld);
21688 }
21689 return true;
21690 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21691 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21692 // are lowered. Note that this only comes up because we do not always visit
21693 // operands before uses. After that is fixed this can be removed and in the
21694 // meantime this is fairly specific to the lowering we expect from IR.
21695 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21696 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21697 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21698 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21699 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21700 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21701 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21702 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21703 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21704 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21705 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
21706 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21707 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21708 B.getOperand(1).getNumOperands() != 4)
21709 return false;
21710 auto SV1 = cast<ShuffleVectorSDNode>(B);
21711 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
21712 int NumElts = B.getValueType().getVectorNumElements();
21713 int NumSubElts = NumElts / 4;
21714 for (int I = 0; I < NumSubElts; I++) {
21715 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21716 if (SV1->getMaskElt(I) != I ||
21717 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21718 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
21719 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
21720 return false;
21721 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21722 if (SV2->getMaskElt(I) != I ||
21723 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21724 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
21725 return false;
21726 }
21727 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
21728 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
21729 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
21730 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
21731 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21732 !Ld2->isSimple() || !Ld3->isSimple())
21733 return false;
21734 Loads.push_back(Ld0);
21735 Loads.push_back(Ld1);
21736 Loads.push_back(Ld2);
21737 Loads.push_back(Ld3);
21738 return true;
21739 }
21740 return false;
21741}
21742
21743 static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
21744 SelectionDAG &DAG,
21745 unsigned &NumSubLoads) {
21746 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21747 return false;
21748
21749 SmallVector<LoadSDNode *> Loads0, Loads1;
21750 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21751 isLoadOrMultipleLoads(Op1, Loads1)) {
21752 if (NumSubLoads && Loads0.size() != NumSubLoads)
21753 return false;
21754 NumSubLoads = Loads0.size();
21755 return Loads0.size() == Loads1.size() &&
21756 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
21757 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21758 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21759 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
21760 Size / 8, 1);
21761 });
21762 }
21763
21764 if (Op0.getOpcode() != Op1.getOpcode())
21765 return false;
21766
21767 switch (Op0.getOpcode()) {
21768 case ISD::ADD:
21769 case ISD::SUB:
21771 DAG, NumSubLoads) &&
21770 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21771 DAG, NumSubLoads) &&
21772 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
21774 case ISD::SIGN_EXTEND:
21775 case ISD::ANY_EXTEND:
21776 case ISD::ZERO_EXTEND:
21777 EVT XVT = Op0.getOperand(0).getValueType();
21778 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21779 XVT.getScalarSizeInBits() != 32)
21780 return false;
21781 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21782 DAG, NumSubLoads);
21783 }
21784 return false;
21785}
21786
21787 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
21788 // into a single load of twice the size, from which we extract the bottom and
21789 // top part so that the shl can use a shll2 instruction. The two loads in that
21790 // example can also be larger trees of instructions, which are identical except
21791 // for the leaves which are all loads offset from the LHS, including
21792 // buildvectors of multiple loads. For example the RHS tree could be
21793 // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
21794 // Whilst it can be common for the larger loads to replace LDP instructions
21795 // (which doesn't gain anything on its own), the larger loads can help create
21796 // more efficient code, and in buildvectors prevent the need for ld1 lane
21797 // inserts which can be slower than normal loads.
21798 static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
21799 EVT VT = N->getValueType(0);
21800 if (!VT.isFixedLengthVector() ||
21801 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21802 VT.getScalarSizeInBits() != 64))
21803 return SDValue();
21804
21805 SDValue Other = N->getOperand(0);
21806 SDValue Shift = N->getOperand(1);
21807 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21808 std::swap(Shift, Other);
21809 APInt ShiftAmt;
21810 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21811 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
21812 return SDValue();
21813
21814 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
21815 !ISD::isExtOpcode(Other.getOpcode()) ||
21816 Shift.getOperand(0).getOperand(0).getValueType() !=
21817 Other.getOperand(0).getValueType() ||
21818 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
21819 return SDValue();
21820
21821 SDValue Op0 = Other.getOperand(0);
21822 SDValue Op1 = Shift.getOperand(0).getOperand(0);
21823
21824 unsigned NumSubLoads = 0;
21825 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21826 return SDValue();
21827
21828 // Attempt to rule out some unprofitable cases using heuristics (some working
21829 // around suboptimal code generation), notably if the extend would not be able
21830 // to use ushll2 instructions as the types are not large enough. Otherwise zips
21831 // will need to be created, which can increase the instruction count.
21832 unsigned NumElts = Op0.getValueType().getVectorNumElements();
21833 unsigned NumSubElts = NumElts / NumSubLoads;
21834 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21835 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
21836 Op0.getValueType().getSizeInBits() < 128 &&
21838 return SDValue();
21839
21840 // Recreate the tree with the new combined loads.
21841 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
21842 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
21843 EVT DVT =
21845
21846 SmallVector<LoadSDNode *> Loads0, Loads1;
21847 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21848 isLoadOrMultipleLoads(Op1, Loads1)) {
21849 EVT LoadVT = EVT::getVectorVT(
21850 *DAG.getContext(), Op0.getValueType().getScalarType(),
21851 Op0.getValueType().getVectorNumElements() / Loads0.size());
21852 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21853
21854 SmallVector<SDValue> NewLoads;
21855 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
21856 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
21857 L0->getBasePtr(), L0->getPointerInfo(),
21858 L0->getBaseAlign());
21859 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
21860 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
21861 NewLoads.push_back(Load);
21862 }
21863 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
21864 }
21865
21867 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
21868 Ops.push_back(GenCombinedTree(O0, O1, DAG));
21869 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
21870 };
21871 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
21872
21873 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
21874 int Hi = NumSubElts, Lo = 0;
21875 for (unsigned i = 0; i < NumSubLoads; i++) {
21876 for (unsigned j = 0; j < NumSubElts; j++) {
21877 LowMask[i * NumSubElts + j] = Lo++;
21878 HighMask[i * NumSubElts + j] = Hi++;
21879 }
21880 Lo += NumSubElts;
21881 Hi += NumSubElts;
21882 }
21883 SDLoc DL(N);
21884 SDValue Ext0, Ext1;
21885 // Extract the top and bottom lanes, then extend the result. Possibly extend
21886 // the result then extract the lanes if the two operands match as it produces
21887 // slightly smaller code.
21888 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
21890 NewOp, DAG.getConstant(0, DL, MVT::i64));
21891 SDValue SubH =
21892 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
21893 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21894 SDValue Extr0 =
21895 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
21896 SDValue Extr1 =
21897 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
21898 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
21899 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
21900 } else {
21902 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
21903 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21904 DAG.getConstant(0, DL, MVT::i64));
21905 SDValue SubH =
21906 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21907 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21908 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
21909 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
21910 }
21911 SDValue NShift =
21912 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
21913 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
21914}
21915
21916 static SDValue performAddSubCombine(SDNode *N,
21917 TargetLowering::DAGCombinerInfo &DCI) {
21918 // Try to change sum of two reductions.
21919 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
21920 return Val;
21921 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
21922 return Val;
21923 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
21924 return Val;
21925 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
21926 return Val;
21927 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
21928 return Val;
21929 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
21930 return Val;
21931 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
21932 return Val;
21933 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
21934 return Val;
21935 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
21936 return Val;
21937
21938 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
21939 return Val;
21940
21941 return performAddSubLongCombine(N, DCI);
21942}
21943
21944// Massage DAGs which we can use the high-half "long" operations on into
21945// something isel will recognize better. E.g.
21946//
21947// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
21948// (aarch64_neon_umull (extract_high (v2i64 vec)))
21949// (extract_high (v2i64 (dup128 scalar)))))
21950//
21953 SelectionDAG &DAG) {
21954 if (DCI.isBeforeLegalizeOps())
21955 return SDValue();
21956
21957 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
21958 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
21959 assert(LHS.getValueType().is64BitVector() &&
21960 RHS.getValueType().is64BitVector() &&
21961 "unexpected shape for long operation");
21962
21963 // Either node could be a DUP, but it's not worth doing both of them (you'd
21964 // just as well use the non-high version) so look for a corresponding extract
21965 // operation on the other "wing".
21966 if (isEssentiallyExtractHighSubvector(LHS)) {
21967 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
21968 if (!RHS.getNode())
21969 return SDValue();
21970 } else if (isEssentiallyExtractHighSubvector(RHS)) {
21971 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
21972 if (!LHS.getNode())
21973 return SDValue();
21974 } else
21975 return SDValue();
21976
21977 if (IID == Intrinsic::not_intrinsic)
21978 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
21979
21980 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
21981 N->getOperand(0), LHS, RHS);
21982}
21983
21984static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21985 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
21986 unsigned ElemBits = ElemTy.getSizeInBits();
21987
21988 int64_t ShiftAmount;
21989 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
21990 APInt SplatValue, SplatUndef;
21991 unsigned SplatBitSize;
21992 bool HasAnyUndefs;
21993 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
21994 HasAnyUndefs, ElemBits) ||
21995 SplatBitSize != ElemBits)
21996 return SDValue();
21997
21998 ShiftAmount = SplatValue.getSExtValue();
21999 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
22000 ShiftAmount = CVN->getSExtValue();
22001 } else
22002 return SDValue();
22003
22004 // If the shift amount is zero, remove the shift intrinsic.
22005 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
22006 return N->getOperand(1);
22007
22008 unsigned Opcode;
22009 bool IsRightShift;
22010 switch (IID) {
22011 default:
22012 llvm_unreachable("Unknown shift intrinsic");
22013 case Intrinsic::aarch64_neon_sqshl:
22014 Opcode = AArch64ISD::SQSHL_I;
22015 IsRightShift = false;
22016 break;
22017 case Intrinsic::aarch64_neon_uqshl:
22018 Opcode = AArch64ISD::UQSHL_I;
22019 IsRightShift = false;
22020 break;
22021 case Intrinsic::aarch64_neon_srshl:
22022 Opcode = AArch64ISD::SRSHR_I;
22023 IsRightShift = true;
22024 break;
22025 case Intrinsic::aarch64_neon_urshl:
22026 Opcode = AArch64ISD::URSHR_I;
22027 IsRightShift = true;
22028 break;
22029 case Intrinsic::aarch64_neon_sqshlu:
22030 Opcode = AArch64ISD::SQSHLU_I;
22031 IsRightShift = false;
22032 break;
22033 case Intrinsic::aarch64_neon_sshl:
22034 case Intrinsic::aarch64_neon_ushl:
22035 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
22036 // left shift for positive shift amounts. For negative shifts we can use a
22037 // VASHR/VLSHR as appropriate.
22038 if (ShiftAmount < 0) {
22039 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
22040 : AArch64ISD::VLSHR;
22041 ShiftAmount = -ShiftAmount;
22042 } else
22043 Opcode = AArch64ISD::VSHL;
22044 IsRightShift = false;
22045 break;
22046 }
22047
22048 EVT VT = N->getValueType(0);
22049 SDValue Op = N->getOperand(1);
22050 SDLoc DL(N);
22051 if (VT == MVT::i64) {
22052 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
22053 VT = MVT::v1i64;
22054 }
22055
22056 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
22057 Op = DAG.getNode(Opcode, DL, VT, Op,
22058 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32));
22059 if (N->getValueType(0) == MVT::i64)
22060 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22061 DAG.getConstant(0, DL, MVT::i64));
22062 return Op;
22063 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
22064 Op = DAG.getNode(Opcode, DL, VT, Op,
22065 DAG.getConstant(ShiftAmount, DL, MVT::i32));
22066 if (N->getValueType(0) == MVT::i64)
22067 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22068 DAG.getConstant(0, DL, MVT::i64));
22069 return Op;
22070 }
22071
22072 return SDValue();
22073}
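// Illustrative examples of the constant-shift combine above:
//   aarch64_neon_sshl(v, splat(-3))  -->  VASHR v, #3  (sshr)
//   aarch64_neon_ushl(v, splat(2))   -->  VSHL  v, #2  (shl)
//   aarch64_neon_sqshl(v, splat(0))  -->  v            (shift removed)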
22074
22075// The CRC32[BH] instructions ignore the high bits of their data operand. Since
22076// the intrinsics must be legal and take an i32, this means there's almost
22077// certainly going to be a zext in the DAG which we can eliminate.
22078static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
22079 SDValue AndN = N->getOperand(2);
22080 if (AndN.getOpcode() != ISD::AND)
22081 return SDValue();
22082
22083 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
22084 if (!CMask || CMask->getZExtValue() != Mask)
22085 return SDValue();
22086
22087 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
22088 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
22089}
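// Illustrative example of the combine above: crc32b(crc, and(x, 0xff)) becomes
// crc32b(crc, x), since CRC32B only reads the low 8 bits of its data operand
// and the explicit masking is redundant.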
22090
22092 SelectionDAG &DAG) {
22093 SDLoc DL(N);
22094 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
22095 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
22096 N->getOperand(1)),
22097 DAG.getConstant(0, DL, MVT::i64));
22098}
22099
22101 SDLoc DL(N);
22102 SDValue Op1 = N->getOperand(1);
22103 SDValue Op2 = N->getOperand(2);
22104 EVT ScalarTy = Op2.getValueType();
22105 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22106 ScalarTy = MVT::i32;
22107
22108 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
22109 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
22110 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
22111 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
22112 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
22113 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
22114}
22115
22117 SDLoc DL(N);
22118 SDValue Scalar = N->getOperand(3);
22119 EVT ScalarTy = Scalar.getValueType();
22120
22121 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22122 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
22123
22124 SDValue Passthru = N->getOperand(1);
22125 SDValue Pred = N->getOperand(2);
22126 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
22127 Pred, Scalar, Passthru);
22128}
22129
22131 SDLoc DL(N);
22132 LLVMContext &Ctx = *DAG.getContext();
22133 EVT VT = N->getValueType(0);
22134
22135 assert(VT.isScalableVector() && "Expected a scalable vector.");
22136
22137 // Current lowering only supports the SVE-ACLE types.
22139 return SDValue();
22140
22141 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
22142 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
22143 EVT ByteVT =
22144 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
22145
22146 // Convert everything to the domain of EXT (i.e. bytes).
22147 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
22148 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
22149 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
22150 DAG.getConstant(ElemSize, DL, MVT::i32));
22151
22152 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
22153 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
22154}
22155
22158 SelectionDAG &DAG) {
22159 if (DCI.isBeforeLegalize())
22160 return SDValue();
22161
22162 SDValue Comparator = N->getOperand(3);
22163 if (Comparator.getOpcode() == AArch64ISD::DUP ||
22164 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
22165 unsigned IID = getIntrinsicID(N);
22166 EVT VT = N->getValueType(0);
22167 EVT CmpVT = N->getOperand(2).getValueType();
22168 SDValue Pred = N->getOperand(1);
22169 SDValue Imm;
22170 SDLoc DL(N);
22171
22172 switch (IID) {
22173 default:
22174 llvm_unreachable("Called with wrong intrinsic!");
22175 break;
22176
22177 // Signed comparisons
22178 case Intrinsic::aarch64_sve_cmpeq_wide:
22179 case Intrinsic::aarch64_sve_cmpne_wide:
22180 case Intrinsic::aarch64_sve_cmpge_wide:
22181 case Intrinsic::aarch64_sve_cmpgt_wide:
22182 case Intrinsic::aarch64_sve_cmplt_wide:
22183 case Intrinsic::aarch64_sve_cmple_wide: {
22184 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22185 int64_t ImmVal = CN->getSExtValue();
22186 if (ImmVal >= -16 && ImmVal <= 15)
22187 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
22188 else
22189 return SDValue();
22190 }
22191 break;
22192 }
22193 // Unsigned comparisons
22194 case Intrinsic::aarch64_sve_cmphs_wide:
22195 case Intrinsic::aarch64_sve_cmphi_wide:
22196 case Intrinsic::aarch64_sve_cmplo_wide:
22197 case Intrinsic::aarch64_sve_cmpls_wide: {
22198 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22199 uint64_t ImmVal = CN->getZExtValue();
22200 if (ImmVal <= 127)
22201 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
22202 else
22203 return SDValue();
22204 }
22205 break;
22206 }
22207 }
22208
22209 if (!Imm)
22210 return SDValue();
22211
22212 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
22213 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
22214 N->getOperand(2), Splat, DAG.getCondCode(CC));
22215 }
22216
22217 return SDValue();
22218}
22219
22222 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22223
22224 SDLoc DL(Op);
22225 assert(Op.getValueType().isScalableVector() &&
22226 TLI.isTypeLegal(Op.getValueType()) &&
22227 "Expected legal scalable vector type!");
22228 assert(Op.getValueType() == Pg.getValueType() &&
22229 "Expected same type for PTEST operands");
22230
22231 // Ensure target specific opcodes are using legal type.
22232 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
22233 SDValue TVal = DAG.getConstant(1, DL, OutVT);
22234 SDValue FVal = DAG.getConstant(0, DL, OutVT);
22235
22236 // Ensure operands have type nxv16i1.
22237 if (Op.getValueType() != MVT::nxv16i1) {
22240 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
22241 else
22242 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
22243 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
22244 }
22245
22246 // Set condition code (CC) flags.
22247 SDValue Test = DAG.getNode(
22248 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
22249 DL, MVT::i32, Pg, Op);
22250
22251 // Convert CC to integer based on requested condition.
22252 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
22253 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
22254 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
22255 return DAG.getZExtOrTrunc(Res, DL, VT);
22256}
22257
22259 SelectionDAG &DAG) {
22260 SDLoc DL(N);
22261
22262 SDValue Pred = N->getOperand(1);
22263 SDValue VecToReduce = N->getOperand(2);
22264
22265 // NOTE: The integer reduction's result type is not always linked to the
22266 // operand's element type so we construct it from the intrinsic's result type.
22267 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
22268 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22269
22270 // SVE reductions set the whole vector register with the first element
22271 // containing the reduction result, which we'll now extract.
22272 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22273 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22274 Zero);
22275}
22276
22278 SelectionDAG &DAG) {
22279 SDLoc DL(N);
22280
22281 SDValue Pred = N->getOperand(1);
22282 SDValue VecToReduce = N->getOperand(2);
22283
22284 EVT ReduceVT = VecToReduce.getValueType();
22285 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22286
22287 // SVE reductions set the whole vector register with the first element
22288 // containing the reduction result, which we'll now extract.
22289 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22290 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22291 Zero);
22292}
22293
22295 SelectionDAG &DAG) {
22296 SDLoc DL(N);
22297
22298 SDValue Pred = N->getOperand(1);
22299 SDValue InitVal = N->getOperand(2);
22300 SDValue VecToReduce = N->getOperand(3);
22301 EVT ReduceVT = VecToReduce.getValueType();
22302
22303 // Ordered reductions use the first lane of the result vector as the
22304 // reduction's initial value.
22305 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22306 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
22307 DAG.getUNDEF(ReduceVT), InitVal, Zero);
22308
22309 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
22310
22311 // SVE reductions set the whole vector register with the first element
22312 // containing the reduction result, which we'll now extract.
22313 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22314 Zero);
22315}
22316
22318 SelectionDAG &DAG) {
22319 if (N->getValueType(0) != MVT::i16)
22320 return SDValue();
22321
22322 SDLoc DL(N);
22323 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
22324 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
22325 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
22326}
22327
22328// If a merged operation has no inactive lanes we can relax it to a predicated
22329// or unpredicated operation, which potentially allows better isel (perhaps
22330// using immediate forms) or relaxing register reuse requirements.
22332 SelectionDAG &DAG, bool UnpredOp = false,
22333 bool SwapOperands = false) {
22334 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
22335 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
22336 SDValue Pg = N->getOperand(1);
22337 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
22338 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
22339
22340 // ISD way to specify an all active predicate.
22341 if (isAllActivePredicate(DAG, Pg)) {
22342 if (UnpredOp)
22343 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
22344
22345 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
22346 }
22347
22348 // FUTURE: SplatVector(true)
22349 return SDValue();
22350}
22351
22353 const AArch64Subtarget *Subtarget,
22354 SelectionDAG &DAG) {
22355
22356 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
22357 getIntrinsicID(N) ==
22358 Intrinsic::experimental_vector_partial_reduce_add &&
22359 "Expected a partial reduction node");
22360
22361 bool Scalable = N->getValueType(0).isScalableVector();
22362 if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
22363 return SDValue();
22364 if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
22365 return SDValue();
22366
22367 SDLoc DL(N);
22368
22369 SDValue Op2 = N->getOperand(2);
22370 unsigned Op2Opcode = Op2->getOpcode();
22371 SDValue MulOpLHS, MulOpRHS;
22372 bool MulOpLHSIsSigned, MulOpRHSIsSigned;
22373 if (ISD::isExtOpcode(Op2Opcode)) {
22374 MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
22375 MulOpLHS = Op2->getOperand(0);
22376 MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType());
22377 } else if (Op2Opcode == ISD::MUL) {
22378 SDValue ExtMulOpLHS = Op2->getOperand(0);
22379 SDValue ExtMulOpRHS = Op2->getOperand(1);
22380
22381 unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
22382 unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
22383 if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
22384 !ISD::isExtOpcode(ExtMulOpRHSOpcode))
22385 return SDValue();
22386
22387 MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
22388 MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
22389
22390 MulOpLHS = ExtMulOpLHS->getOperand(0);
22391 MulOpRHS = ExtMulOpRHS->getOperand(0);
22392
22393 if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
22394 return SDValue();
22395 } else
22396 return SDValue();
22397
22398 SDValue Acc = N->getOperand(1);
22399 EVT ReducedVT = N->getValueType(0);
22400 EVT MulSrcVT = MulOpLHS.getValueType();
22401
22402 // Dot products operate on chunks of four elements so there must be four times
22403 // as many elements in the wide type
22404 if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
22405 !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
22406 !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
22407 !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
22408 !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
22409 !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
22410 return SDValue();
22411
22412 // If the extensions are mixed, we should lower it to a usdot instead
22413 unsigned Opcode = 0;
22414 if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
22415 if (!Subtarget->hasMatMulInt8())
22416 return SDValue();
22417
22418 bool Scalable = N->getValueType(0).isScalableVT();
22419 // There's no nxv2i64 version of usdot
22420 if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
22421 return SDValue();
22422
22423 Opcode = AArch64ISD::USDOT;
22424 // USDOT expects the signed operand to be last
22425 if (!MulOpRHSIsSigned)
22426 std::swap(MulOpLHS, MulOpRHS);
22427 } else
22428 Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
22429
22430 // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
22431 // product followed by a zero / sign extension
22432 if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
22433 (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
22434 EVT ReducedVTI32 =
22435 (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
22436
22437 SDValue DotI32 =
22438 DAG.getNode(Opcode, DL, ReducedVTI32,
22439 DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS);
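  // The i32 dot product accumulates into a zero accumulator, so each i32 lane
  // holds at most the sum of four byte products. For the unsigned case this
  // never sets the sign bit, so a sign-extension of the i32 result is correct
  // for both the signed and unsigned dot product.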
22440 SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT);
22441 return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended);
22442 }
22443
22444 return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
22445}
22446
22447 static SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
22448 const AArch64Subtarget *Subtarget,
22449 SelectionDAG &DAG) {
22450
22451 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
22452 getIntrinsicID(N) ==
22453 Intrinsic::experimental_vector_partial_reduce_add &&
22454 "Expected a partial reduction node");
22455
22456 if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
22457 return SDValue();
22458
22459 SDLoc DL(N);
22460
22461 if (!ISD::isExtOpcode(N->getOperand(2).getOpcode()))
22462 return SDValue();
22463 SDValue Acc = N->getOperand(1);
22464 SDValue Ext = N->getOperand(2);
22465 EVT AccVT = Acc.getValueType();
22466 EVT ExtVT = Ext.getValueType();
22467 if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
22468 return SDValue();
22469
22470 SDValue ExtOp = Ext->getOperand(0);
22471 EVT ExtOpVT = ExtOp.getValueType();
22472
22473 if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
22474 !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
22475 !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
22476 return SDValue();
22477
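  // Lower to a pair of widening adds: accumulate the even (bottom) elements
  // into Acc first, then the odd (top) elements on top of that result.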
22478 bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
22479 unsigned BottomOpcode =
22480 ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
22481 unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
22482 SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, ExtOp);
22483 return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp);
22484}
22485
22486static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22487 SDLoc DL(N);
22488 EVT VT = N->getValueType(0);
22489 SDValue Op1 = N->getOperand(1);
22490 SDValue Op2 = N->getOperand(2);
22491 SDValue Op3 = N->getOperand(3);
22492
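  // All four bitwise-select intrinsics map onto the single BSP node; the
  // bsl1n/bsl2n variants invert one of the data operands and nbsl inverts the
  // result.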
22493 switch (IID) {
22494 default:
22495 llvm_unreachable("Called with wrong intrinsic!");
22496 case Intrinsic::aarch64_sve_bsl:
22497 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
22498 case Intrinsic::aarch64_sve_bsl1n:
22499 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
22500 Op2);
22501 case Intrinsic::aarch64_sve_bsl2n:
22502 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
22503 DAG.getNOT(DL, Op2, VT));
22504 case Intrinsic::aarch64_sve_nbsl:
22505 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
22506 VT);
22507 }
22508}
22509
22510 static SDValue performIntrinsicCombine(SDNode *N,
22511                                        TargetLowering::DAGCombinerInfo &DCI,
22512 const AArch64Subtarget *Subtarget) {
22513 SelectionDAG &DAG = DCI.DAG;
22514 unsigned IID = getIntrinsicID(N);
22515 switch (IID) {
22516 default:
22517 break;
22518 case Intrinsic::experimental_vector_partial_reduce_add: {
22519 if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
22520 return Dot;
22521 if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
22522 return WideAdd;
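    // Neither a dot product nor a widening add applies, so fall back to the
    // generic PARTIAL_REDUCE_UMLA node with a splat of 1 as the multiplicand.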
22523 SDLoc DL(N);
22524 SDValue Input = N->getOperand(2);
22525 return DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, N->getValueType(0),
22526 N->getOperand(1), Input,
22527 DAG.getConstant(1, DL, Input.getValueType()));
22528 }
22529 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22530 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22531 return tryCombineFixedPointConvert(N, DCI, DAG);
22532 case Intrinsic::aarch64_neon_saddv:
22533 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
22534 case Intrinsic::aarch64_neon_uaddv:
22535 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
22536 case Intrinsic::aarch64_neon_sminv:
22537 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
22538 case Intrinsic::aarch64_neon_uminv:
22539 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
22540 case Intrinsic::aarch64_neon_smaxv:
22541 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
22542 case Intrinsic::aarch64_neon_umaxv:
22543 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
22544 case Intrinsic::aarch64_neon_fmax:
22545 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
22546 N->getOperand(1), N->getOperand(2));
22547 case Intrinsic::aarch64_neon_fmin:
22548 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22549 N->getOperand(1), N->getOperand(2));
22550 case Intrinsic::aarch64_neon_fmaxnm:
22551 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22552 N->getOperand(1), N->getOperand(2));
22553 case Intrinsic::aarch64_neon_fminnm:
22554 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22555 N->getOperand(1), N->getOperand(2));
22556 case Intrinsic::aarch64_neon_smull:
22557 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22558 N->getOperand(1), N->getOperand(2));
22559 case Intrinsic::aarch64_neon_umull:
22560 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22561 N->getOperand(1), N->getOperand(2));
22562 case Intrinsic::aarch64_neon_pmull:
22563 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22564 N->getOperand(1), N->getOperand(2));
22565 case Intrinsic::aarch64_neon_sqdmull:
22566 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22567 case Intrinsic::aarch64_neon_sqshl:
22568 case Intrinsic::aarch64_neon_uqshl:
22569 case Intrinsic::aarch64_neon_sqshlu:
22570 case Intrinsic::aarch64_neon_srshl:
22571 case Intrinsic::aarch64_neon_urshl:
22572 case Intrinsic::aarch64_neon_sshl:
22573 case Intrinsic::aarch64_neon_ushl:
22574 return tryCombineShiftImm(IID, N, DAG);
22575 case Intrinsic::aarch64_neon_sabd:
22576 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22577 N->getOperand(1), N->getOperand(2));
22578 case Intrinsic::aarch64_neon_uabd:
22579 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22580 N->getOperand(1), N->getOperand(2));
22581 case Intrinsic::aarch64_neon_fcvtzs:
22582 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
22583 case Intrinsic::aarch64_neon_fcvtzu:
22584 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
22585 case Intrinsic::aarch64_neon_fcvtas:
22586 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
22587 case Intrinsic::aarch64_neon_fcvtau:
22588 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
22589 case Intrinsic::aarch64_neon_fcvtms:
22590 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
22591 case Intrinsic::aarch64_neon_fcvtmu:
22592 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
22593 case Intrinsic::aarch64_neon_fcvtns:
22594 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
22595 case Intrinsic::aarch64_neon_fcvtnu:
22596 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
22597 case Intrinsic::aarch64_neon_fcvtps:
22598 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
22599 case Intrinsic::aarch64_neon_fcvtpu:
22600 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
22601 case Intrinsic::aarch64_crc32b:
22602 case Intrinsic::aarch64_crc32cb:
22603 return tryCombineCRC32(0xff, N, DAG);
22604 case Intrinsic::aarch64_crc32h:
22605 case Intrinsic::aarch64_crc32ch:
22606 return tryCombineCRC32(0xffff, N, DAG);
22607 case Intrinsic::aarch64_sve_saddv:
22608 // There is no i64 version of SADDV because the sign is irrelevant.
22609 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
22610 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22611 else
22612 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22613 case Intrinsic::aarch64_sve_uaddv:
22614 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22615 case Intrinsic::aarch64_sve_smaxv:
22616 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22617 case Intrinsic::aarch64_sve_umaxv:
22618 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22619 case Intrinsic::aarch64_sve_sminv:
22620 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22621 case Intrinsic::aarch64_sve_uminv:
22622 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22623 case Intrinsic::aarch64_sve_orv:
22624 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22625 case Intrinsic::aarch64_sve_eorv:
22626 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22627 case Intrinsic::aarch64_sve_andv:
22628 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22629 case Intrinsic::aarch64_sve_index:
22630 return LowerSVEIntrinsicIndex(N, DAG);
22631 case Intrinsic::aarch64_sve_dup:
22632 return LowerSVEIntrinsicDUP(N, DAG);
22633 case Intrinsic::aarch64_sve_dup_x:
22634 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22635 N->getOperand(1));
22636 case Intrinsic::aarch64_sve_ext:
22637 return LowerSVEIntrinsicEXT(N, DAG);
22638 case Intrinsic::aarch64_sve_mul_u:
22639 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22640 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22641 case Intrinsic::aarch64_sve_smulh_u:
22642 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22643 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22644 case Intrinsic::aarch64_sve_umulh_u:
22645 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22646 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22647 case Intrinsic::aarch64_sve_smin_u:
22648 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22649 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22650 case Intrinsic::aarch64_sve_umin_u:
22651 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22652 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22653 case Intrinsic::aarch64_sve_smax_u:
22654 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22655 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22656 case Intrinsic::aarch64_sve_umax_u:
22657 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22658 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22659 case Intrinsic::aarch64_sve_lsl_u:
22660 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22661 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22662 case Intrinsic::aarch64_sve_lsr_u:
22663 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22664 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22665 case Intrinsic::aarch64_sve_asr_u:
22666 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22667 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22668 case Intrinsic::aarch64_sve_fadd_u:
22669 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22670 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22671 case Intrinsic::aarch64_sve_fdiv_u:
22672 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22673 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22674 case Intrinsic::aarch64_sve_fmax_u:
22675 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22676 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22677 case Intrinsic::aarch64_sve_fmaxnm_u:
22678 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22679 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22680 case Intrinsic::aarch64_sve_fmla_u:
22681 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22682 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22683 N->getOperand(2));
22684 case Intrinsic::aarch64_sve_fmin_u:
22685 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22686 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22687 case Intrinsic::aarch64_sve_fminnm_u:
22688 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22689 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22690 case Intrinsic::aarch64_sve_fmul_u:
22691 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22692 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22693 case Intrinsic::aarch64_sve_fsub_u:
22694 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22695 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22696 case Intrinsic::aarch64_sve_add_u:
22697 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22698 N->getOperand(3));
22699 case Intrinsic::aarch64_sve_sub_u:
22700 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22701 N->getOperand(3));
22702 case Intrinsic::aarch64_sve_subr:
22703 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22704 case Intrinsic::aarch64_sve_and_u:
22705 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22706 N->getOperand(3));
22707 case Intrinsic::aarch64_sve_bic_u:
22708 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22709 N->getOperand(2), N->getOperand(3));
22710 case Intrinsic::aarch64_sve_saddwb:
22711 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22712 N->getOperand(1), N->getOperand(2));
22713 case Intrinsic::aarch64_sve_saddwt:
22714 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22715 N->getOperand(1), N->getOperand(2));
22716 case Intrinsic::aarch64_sve_uaddwb:
22717 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22718 N->getOperand(1), N->getOperand(2));
22719 case Intrinsic::aarch64_sve_uaddwt:
22720 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22721 N->getOperand(1), N->getOperand(2));
22722 case Intrinsic::aarch64_sve_eor_u:
22723 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22724 N->getOperand(3));
22725 case Intrinsic::aarch64_sve_orr_u:
22726 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22727 N->getOperand(3));
22728 case Intrinsic::aarch64_sve_sabd_u:
22729 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22730 N->getOperand(2), N->getOperand(3));
22731 case Intrinsic::aarch64_sve_uabd_u:
22732 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22733 N->getOperand(2), N->getOperand(3));
22734 case Intrinsic::aarch64_sve_sdiv_u:
22735 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22736 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22737 case Intrinsic::aarch64_sve_udiv_u:
22738 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22739 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22740 case Intrinsic::aarch64_sve_sqadd:
22741 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22742 case Intrinsic::aarch64_sve_sqsub_u:
22743 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22744 N->getOperand(2), N->getOperand(3));
22745 case Intrinsic::aarch64_sve_uqadd:
22746 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22747 case Intrinsic::aarch64_sve_uqsub_u:
22748 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22749 N->getOperand(2), N->getOperand(3));
22750 case Intrinsic::aarch64_sve_sqadd_x:
22751 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22752 N->getOperand(1), N->getOperand(2));
22753 case Intrinsic::aarch64_sve_sqsub_x:
22754 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22755 N->getOperand(1), N->getOperand(2));
22756 case Intrinsic::aarch64_sve_uqadd_x:
22757 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22758 N->getOperand(1), N->getOperand(2));
22759 case Intrinsic::aarch64_sve_uqsub_x:
22760 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22761 N->getOperand(1), N->getOperand(2));
22762 case Intrinsic::aarch64_sve_asrd:
22763 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22764 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22765 case Intrinsic::aarch64_sve_cmphs:
22766 if (!N->getOperand(2).getValueType().isFloatingPoint())
22767 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22768 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22769 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22770 break;
22771 case Intrinsic::aarch64_sve_cmphi:
22772 if (!N->getOperand(2).getValueType().isFloatingPoint())
22773 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22774 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22775 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22776 break;
22777 case Intrinsic::aarch64_sve_fcmpge:
22778 case Intrinsic::aarch64_sve_cmpge:
22779 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22780 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22781 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22782 break;
22783 case Intrinsic::aarch64_sve_fcmpgt:
22784 case Intrinsic::aarch64_sve_cmpgt:
22785 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22786 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22787 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22788 break;
22789 case Intrinsic::aarch64_sve_fcmpeq:
22790 case Intrinsic::aarch64_sve_cmpeq:
22791 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22792 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22793 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22794 break;
22795 case Intrinsic::aarch64_sve_fcmpne:
22796 case Intrinsic::aarch64_sve_cmpne:
22797 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22798 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22799 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22800 break;
22801 case Intrinsic::aarch64_sve_fcmpuo:
22802 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22803 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22804 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22805 break;
22806 case Intrinsic::aarch64_sve_fadda:
22807 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22808 case Intrinsic::aarch64_sve_faddv:
22809 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22810 case Intrinsic::aarch64_sve_fmaxnmv:
22811 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22812 case Intrinsic::aarch64_sve_fmaxv:
22813 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22814 case Intrinsic::aarch64_sve_fminnmv:
22815 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22816 case Intrinsic::aarch64_sve_fminv:
22817 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22818 case Intrinsic::aarch64_sve_sel:
22819 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22820 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22821 case Intrinsic::aarch64_sve_cmpeq_wide:
22822 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
22823 case Intrinsic::aarch64_sve_cmpne_wide:
22824 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
22825 case Intrinsic::aarch64_sve_cmpge_wide:
22826 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
22827 case Intrinsic::aarch64_sve_cmpgt_wide:
22828 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
22829 case Intrinsic::aarch64_sve_cmplt_wide:
22830 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
22831 case Intrinsic::aarch64_sve_cmple_wide:
22832 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
22833 case Intrinsic::aarch64_sve_cmphs_wide:
22834 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
22835 case Intrinsic::aarch64_sve_cmphi_wide:
22836 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
22837 case Intrinsic::aarch64_sve_cmplo_wide:
22838 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
22839 case Intrinsic::aarch64_sve_cmpls_wide:
22840 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
22841 case Intrinsic::aarch64_sve_ptest_any:
22842 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22843                     AArch64CC::ANY_ACTIVE);
22844 case Intrinsic::aarch64_sve_ptest_first:
22845 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22846                     AArch64CC::FIRST_ACTIVE);
22847 case Intrinsic::aarch64_sve_ptest_last:
22848 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22849                     AArch64CC::LAST_ACTIVE);
22850 case Intrinsic::aarch64_sve_whilelo:
22851 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
22852 N->getOperand(1), N->getOperand(2));
22853 case Intrinsic::aarch64_sve_bsl:
22854 case Intrinsic::aarch64_sve_bsl1n:
22855 case Intrinsic::aarch64_sve_bsl2n:
22856 case Intrinsic::aarch64_sve_nbsl:
22857 return combineSVEBitSel(IID, N, DAG);
22858 }
22859 return SDValue();
22860}
22861
22862static bool isCheapToExtend(const SDValue &N) {
22863 unsigned OC = N->getOpcode();
22864 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22865          ISD::isConstantSplatVectorAllZeros(N.getNode());
22866}
22867
22868static SDValue
22869 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22870 SelectionDAG &DAG) {
22871 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22872 // we can move the sext into the arguments and have the same result. For
22873 // example, if A and B are both loads, we can make those extending loads and
22874 // avoid an extra instruction. This pattern appears often in VLS code
22875 // generation where the inputs to the setcc have a different size to the
22876 // instruction that wants to use the result of the setcc.
22877 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22878 N->getOperand(0)->getOpcode() == ISD::SETCC);
22879 const SDValue SetCC = N->getOperand(0);
22880
22881 const SDValue CCOp0 = SetCC.getOperand(0);
22882 const SDValue CCOp1 = SetCC.getOperand(1);
22883 if (!CCOp0->getValueType(0).isInteger() ||
22884 !CCOp1->getValueType(0).isInteger())
22885 return SDValue();
22886
22887 ISD::CondCode Code =
22888 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
22889
22890 ISD::NodeType ExtType =
22891 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22892
22893 if (isCheapToExtend(SetCC.getOperand(0)) &&
22894 isCheapToExtend(SetCC.getOperand(1))) {
22895 const SDValue Ext1 =
22896 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
22897 const SDValue Ext2 =
22898 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
22899
22900 return DAG.getSetCC(
22901 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
22902 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
22903 }
22904
22905 return SDValue();
22906}
22907
22908// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22909// This comes from interleaved vectorization. It is performed late to capture
22910// uitofp converts too.
22911 static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
22912 SelectionDAG &DAG) {
22913 EVT VT = N->getValueType(0);
22914 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22915 N->getOpcode() != ISD::ZERO_EXTEND ||
22916 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22917 return SDValue();
22918
22919 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
22920 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22921 return SDValue();
22922
22923 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
22924 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
22925 if (!Shuffle ||
22926 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22927 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22928 return SDValue();
22929
22930 unsigned Idx;
22931   bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22932 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
22933 // An undef interleave shuffle can come up after other canonicalizations,
22934 // where the shuffle has been converted to
22935 // zext(extract(shuffle b, undef, [u,u,0,4]))
22936 bool IsUndefDeInterleave = false;
22937 if (!IsDeInterleave)
22938 IsUndefDeInterleave =
22939 Shuffle->getOperand(1).isUndef() &&
22940 all_of(
22941 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
22942 [](int M) { return M < 0; }) &&
22943           ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22944 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
22945 VT.getVectorNumElements() / 2),
22946 4, Idx);
22947 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22948 return SDValue();
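  // Idx selects which of the four interleaved lanes is extracted: lanes 0/1
  // live in uzp1 of the two sources, lanes 2/3 in uzp2, and odd lanes also
  // need a right shift by the narrow element width before the final mask.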
22949 SDLoc DL(N);
22950 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22951 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
22952 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22953 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
22954 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22955 VT, BC1, BC2);
22956 if ((Idx & 1) == 1)
22957 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
22958 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
22959 return DAG.getNode(
22960 ISD::AND, DL, VT, UZP,
22961 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22962}
22963
22964 // This comes up similarly to the above when lowering deinterleaving shuffles
22965 // from zexts. We have legalized the operations in the general case to
22966// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22967// the extract is to the low half and the uzp is uzp1. There would be an extra
22968// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
22969// there could also be an existing and / shift that can be combined in, either
22970 // before or after the extract.
22971 static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22972 EVT VT = N->getValueType(0);
22973 if (N->getOpcode() != ISD::ZERO_EXTEND ||
22974 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22975 return SDValue();
22976
22977 SDValue Op = N->getOperand(0);
22978 unsigned ExtOffset = (unsigned)-1;
22979 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22980 ExtOffset = Op.getConstantOperandVal(1);
22981 Op = Op.getOperand(0);
22982 }
22983
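  // Mask starts as the low (narrow-element) bits of each wide element and is
  // refined below if the value already passes through a shift, an AND or a
  // BICi on its way to the uzp node.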
22984 unsigned Shift = 0;
22985   APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
22986 Op.getValueType().getScalarSizeInBits());
22987
22988 if (Op.getOpcode() == AArch64ISD::VLSHR) {
22989 Shift = Op.getConstantOperandVal(1);
22990 Op = Op.getOperand(0);
22991 Mask = Mask.lshr(Shift);
22992 }
22993 if (Op.getOpcode() == ISD::AND &&
22994 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
22995 Op = Op.getOperand(0);
22996 Mask = Mask.zext(VT.getScalarSizeInBits());
22997 } else if (Op.getOpcode() == AArch64ISD::BICi) {
22998 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
22999 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
23000 Mask = Mask.zext(VT.getScalarSizeInBits());
23001 Op = Op.getOperand(0);
23002 }
23003
23004 if (ExtOffset == (unsigned)-1) {
23005 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23006 ExtOffset = Op.getConstantOperandVal(1);
23007 Op = Op.getOperand(0);
23008 } else
23009 return SDValue();
23010 }
23011 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23012 return SDValue();
23013
23014 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
23015 return SDValue();
23016 if (Op.getOpcode() == AArch64ISD::UZP2)
23017 Shift += VT.getScalarSizeInBits() / 2;
23018
23019 SDLoc DL(N);
23020 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23021 Op.getOperand(ExtOffset == 0 ? 0 : 1));
23022 if (Shift != 0)
23023 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
23024 DAG.getConstant(Shift, DL, MVT::i32));
23025 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
23026}
23027
23028 static SDValue performExtendCombine(SDNode *N,
23029                                     TargetLowering::DAGCombinerInfo &DCI,
23030 SelectionDAG &DAG) {
23031 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
23032 // we can convert that DUP into another extract_high (of a bigger DUP), which
23033 // helps the backend to decide that an sabdl2 would be useful, saving a real
23034 // extract_high operation.
23035 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
23036 N->getOperand(0).getValueType().is64BitVector() &&
23037 (N->getOperand(0).getOpcode() == ISD::ABDU ||
23038 N->getOperand(0).getOpcode() == ISD::ABDS)) {
23039 SDNode *ABDNode = N->getOperand(0).getNode();
23040 SDValue NewABD =
23041       tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
23042 if (!NewABD.getNode())
23043 return SDValue();
23044
23045 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
23046 }
23047
23048   if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
23049 return R;
23050 if (SDValue R = performZExtUZPCombine(N, DAG))
23051 return R;
23052
23053 if (N->getValueType(0).isFixedLengthVector() &&
23054 N->getOpcode() == ISD::SIGN_EXTEND &&
23055 N->getOperand(0)->getOpcode() == ISD::SETCC)
23056 return performSignExtendSetCCCombine(N, DCI, DAG);
23057
23058 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
23059 // that the top half of the result register must be unused, due to the
23060 // any_extend. This means that we can replace this pattern with (rev16
23061 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
23062 // ...)), which is what this pattern would otherwise be lowered to.
23063   // Only apply this optimisation if the any_extend in the original pattern is
23064   // to i32 or i64, because this type will become the input type to REV16 in
23065   // the new pattern, so it must be a legitimate REV16 input type.
23066 SDValue Bswap = N->getOperand(0);
23067 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
23068 Bswap.getValueType() == MVT::i16 &&
23069 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
23070 SDLoc DL(N);
23071 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
23072 Bswap->getOperand(0));
23073 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
23074 NewAnyExtend);
23075 }
23076
23077 return SDValue();
23078}
23079
23080 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
23081 SDValue SplatVal, unsigned NumVecElts) {
23082 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
23083 Align OrigAlignment = St.getAlign();
23084 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
23085
23086 // Create scalar stores. This is at least as good as the code sequence for a
23087 // split unaligned store which is a dup.s, ext.b, and two stores.
23088 // Most of the time the three stores should be replaced by store pair
23089 // instructions (stp).
23090 SDLoc DL(&St);
23091 SDValue BasePtr = St.getBasePtr();
23092 uint64_t BaseOffset = 0;
23093
23094 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
23095 SDValue NewST1 =
23096 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
23097 OrigAlignment, St.getMemOperand()->getFlags());
23098
23099   // As this is in ISel, we will not merge this add, which may degrade results.
23100 if (BasePtr->getOpcode() == ISD::ADD &&
23101 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
23102 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
23103 BasePtr = BasePtr->getOperand(0);
23104 }
23105
23106 unsigned Offset = EltOffset;
23107 while (--NumVecElts) {
23108 Align Alignment = commonAlignment(OrigAlignment, Offset);
23109 SDValue OffsetPtr =
23110 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23111 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
23112 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
23113 PtrInfo.getWithOffset(Offset), Alignment,
23114 St.getMemOperand()->getFlags());
23115 Offset += EltOffset;
23116 }
23117 return NewST1;
23118}
23119
23120// Returns an SVE type that ContentTy can be trivially sign or zero extended
23121// into.
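// For example, both nxv4i8 and nxv4i16 widen into an nxv4i32 container.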
23122static MVT getSVEContainerType(EVT ContentTy) {
23123 assert(ContentTy.isSimple() && "No SVE containers for extended types");
23124
23125 switch (ContentTy.getSimpleVT().SimpleTy) {
23126 default:
23127 llvm_unreachable("No known SVE container for this MVT type");
23128 case MVT::nxv2i8:
23129 case MVT::nxv2i16:
23130 case MVT::nxv2i32:
23131 case MVT::nxv2i64:
23132 case MVT::nxv2f32:
23133 case MVT::nxv2f64:
23134 return MVT::nxv2i64;
23135 case MVT::nxv4i8:
23136 case MVT::nxv4i16:
23137 case MVT::nxv4i32:
23138 case MVT::nxv4f32:
23139 return MVT::nxv4i32;
23140 case MVT::nxv8i8:
23141 case MVT::nxv8i16:
23142 case MVT::nxv8f16:
23143 case MVT::nxv8bf16:
23144 return MVT::nxv8i16;
23145 case MVT::nxv16i8:
23146 return MVT::nxv16i8;
23147 }
23148}
23149
23150 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
23151 SDLoc DL(N);
23152 EVT VT = N->getValueType(0);
23153
23154   if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23155 return SDValue();
23156
23157 EVT ContainerVT = VT;
23158 if (ContainerVT.isInteger())
23159 ContainerVT = getSVEContainerType(ContainerVT);
23160
23161 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23162 SDValue Ops[] = { N->getOperand(0), // Chain
23163 N->getOperand(2), // Pg
23164 N->getOperand(3), // Base
23165 DAG.getValueType(VT) };
23166
23167 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
23168 SDValue LoadChain = SDValue(Load.getNode(), 1);
23169
23170 if (ContainerVT.isInteger() && (VT != ContainerVT))
23171 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
23172
23173 return DAG.getMergeValues({ Load, LoadChain }, DL);
23174}
23175
23176 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
23177 SDLoc DL(N);
23178 EVT VT = N->getValueType(0);
23179 EVT PtrTy = N->getOperand(3).getValueType();
23180
23181 EVT LoadVT = VT;
23182 if (VT.isFloatingPoint())
23183 LoadVT = VT.changeTypeToInteger();
23184
23185 auto *MINode = cast<MemIntrinsicSDNode>(N);
23186 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
23187 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
23188 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
23189 MINode->getOperand(2), PassThru,
23190 MINode->getMemoryVT(), MINode->getMemOperand(),
23191                                 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
23192
23193 if (VT.isFloatingPoint()) {
23194 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
23195 return DAG.getMergeValues(Ops, DL);
23196 }
23197
23198 return L;
23199}
23200
23201template <unsigned Opcode>
23202 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
23203 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
23204 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
23205 "Unsupported opcode.");
23206 SDLoc DL(N);
23207 EVT VT = N->getValueType(0);
23208
23209 EVT LoadVT = VT;
23210 if (VT.isFloatingPoint())
23211 LoadVT = VT.changeTypeToInteger();
23212
23213 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
23214 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
23215 SDValue LoadChain = SDValue(Load.getNode(), 1);
23216
23217 if (VT.isFloatingPoint())
23218 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
23219
23220 return DAG.getMergeValues({Load, LoadChain}, DL);
23221}
23222
23223 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
23224 SDLoc DL(N);
23225 SDValue Data = N->getOperand(2);
23226 EVT DataVT = Data.getValueType();
23227 EVT HwSrcVt = getSVEContainerType(DataVT);
23228 SDValue InputVT = DAG.getValueType(DataVT);
23229
23230 if (DataVT.isFloatingPoint())
23231 InputVT = DAG.getValueType(HwSrcVt);
23232
23233 SDValue SrcNew;
23234 if (Data.getValueType().isFloatingPoint())
23235 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
23236 else
23237 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
23238
23239 SDValue Ops[] = { N->getOperand(0), // Chain
23240 SrcNew,
23241 N->getOperand(4), // Base
23242 N->getOperand(3), // Pg
23243 InputVT
23244 };
23245
23246 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
23247}
23248
23249 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
23250 SDLoc DL(N);
23251
23252 SDValue Data = N->getOperand(2);
23253 EVT DataVT = Data.getValueType();
23254 EVT PtrTy = N->getOperand(4).getValueType();
23255
23256 if (DataVT.isFloatingPoint())
23257     Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
23258
23259 auto *MINode = cast<MemIntrinsicSDNode>(N);
23260 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
23261 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
23262 MINode->getMemoryVT(), MINode->getMemOperand(),
23263 ISD::UNINDEXED, false, false);
23264}
23265
23266/// Replace a vector store of a zero splat with scalar stores of WZR/XZR. The
23267/// load store optimizer pass will merge them to store pair stores. This should
23268/// be better than a movi to create the vector zero followed by a vector store
23269/// if the zero constant is not re-used, since one instruction and one register
23270/// live range will be removed.
23271///
23272/// For example, the final generated code should be:
23273///
23274/// stp xzr, xzr, [x0]
23275///
23276/// instead of:
23277///
23278/// movi v0.2d, #0
23279/// str q0, [x0]
23280///
23281 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23282 SDValue StVal = St.getValue();
23283 EVT VT = StVal.getValueType();
23284
23285 // Avoid scalarizing zero splat stores for scalable vectors.
23286 if (VT.isScalableVector())
23287 return SDValue();
23288
23289 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
23290 // 2, 3 or 4 i32 elements.
23291 int NumVecElts = VT.getVectorNumElements();
23292 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
23293 VT.getVectorElementType().getSizeInBits() == 64) ||
23294 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
23295 VT.getVectorElementType().getSizeInBits() == 32)))
23296 return SDValue();
23297
23298 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
23299 return SDValue();
23300
23301 // If the zero constant has more than one use then the vector store could be
23302 // better since the constant mov will be amortized and stp q instructions
23303 // should be able to be formed.
23304 if (!StVal.hasOneUse())
23305 return SDValue();
23306
23307 // If the store is truncating then it's going down to i16 or smaller, which
23308 // means it can be implemented in a single store anyway.
23309 if (St.isTruncatingStore())
23310 return SDValue();
23311
23312 // If the immediate offset of the address operand is too large for the stp
23313 // instruction, then bail out.
23314 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
23315 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
23316 if (Offset < -512 || Offset > 504)
23317 return SDValue();
23318 }
23319
23320 for (int I = 0; I < NumVecElts; ++I) {
23321 SDValue EltVal = StVal.getOperand(I);
23322 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
23323 return SDValue();
23324 }
23325
23326 // Use a CopyFromReg WZR/XZR here to prevent
23327 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
23328 SDLoc DL(&St);
23329 unsigned ZeroReg;
23330 EVT ZeroVT;
23331 if (VT.getVectorElementType().getSizeInBits() == 32) {
23332 ZeroReg = AArch64::WZR;
23333 ZeroVT = MVT::i32;
23334 } else {
23335 ZeroReg = AArch64::XZR;
23336 ZeroVT = MVT::i64;
23337 }
23338 SDValue SplatVal =
23339 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
23340 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23341}
23342
23343/// Replace a vector store of a scalar splat with scalar stores of the scalar
23344/// value. The load store optimizer pass will merge them to store pair stores.
23345/// This has better performance than a splat of the scalar followed by a split
23346/// vector store. Even if the stores are not merged it is four stores vs a dup,
23347/// followed by an ext.b and two stores.
23348 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23349 SDValue StVal = St.getValue();
23350 EVT VT = StVal.getValueType();
23351
23352 // Don't replace floating point stores, they possibly won't be transformed to
23353 // stp because of the store pair suppress pass.
23354 if (VT.isFloatingPoint())
23355 return SDValue();
23356
23357 // We can express a splat as store pair(s) for 2 or 4 elements.
23358 unsigned NumVecElts = VT.getVectorNumElements();
23359 if (NumVecElts != 4 && NumVecElts != 2)
23360 return SDValue();
23361
23362 // If the store is truncating then it's going down to i16 or smaller, which
23363 // means it can be implemented in a single store anyway.
23364 if (St.isTruncatingStore())
23365 return SDValue();
23366
23367 // Check that this is a splat.
23368 // Make sure that each of the relevant vector element locations are inserted
23369 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
23370 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
23371 SDValue SplatVal;
23372 for (unsigned I = 0; I < NumVecElts; ++I) {
23373 // Check for insert vector elements.
23374 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
23375 return SDValue();
23376
23377 // Check that same value is inserted at each vector element.
23378 if (I == 0)
23379 SplatVal = StVal.getOperand(1);
23380 else if (StVal.getOperand(1) != SplatVal)
23381 return SDValue();
23382
23383 // Check insert element index.
23384 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
23385 if (!CIndex)
23386 return SDValue();
23387 uint64_t IndexVal = CIndex->getZExtValue();
23388 if (IndexVal >= NumVecElts)
23389 return SDValue();
23390 IndexNotInserted.reset(IndexVal);
23391
23392 StVal = StVal.getOperand(0);
23393 }
23394 // Check that all vector element locations were inserted to.
23395 if (IndexNotInserted.any())
23396 return SDValue();
23397
23398 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23399}
23400
23401 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23402 SelectionDAG &DAG,
23403 const AArch64Subtarget *Subtarget) {
23404
23405 StoreSDNode *S = cast<StoreSDNode>(N);
23406 if (S->isVolatile() || S->isIndexed())
23407 return SDValue();
23408
23409 SDValue StVal = S->getValue();
23410 EVT VT = StVal.getValueType();
23411
23412 if (!VT.isFixedLengthVector())
23413 return SDValue();
23414
23415 // If we get a splat of zeros, convert this vector store to a store of
23416 // scalars. They will be merged into store pairs of xzr thereby removing one
23417 // instruction and one register.
23418 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
23419 return ReplacedZeroSplat;
23420
23421 // FIXME: The logic for deciding if an unaligned store should be split should
23422 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
23423 // a call to that function here.
23424
23425 if (!Subtarget->isMisaligned128StoreSlow())
23426 return SDValue();
23427
23428 // Don't split at -Oz.
23429   if (DAG.getMachineFunction().getFunction().hasMinSize())
23430 return SDValue();
23431
23432 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
23433 // those up regresses performance on micro-benchmarks and olden/bh.
23434 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
23435 return SDValue();
23436
23437 // Split unaligned 16B stores. They are terrible for performance.
23438 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
23439 // extensions can use this to mark that it does not want splitting to happen
23440 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
23441 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
23442 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
23443 S->getAlign() <= Align(2))
23444 return SDValue();
23445
23446 // If we get a splat of a scalar convert this vector store to a store of
23447 // scalars. They will be merged into store pairs thereby removing two
23448 // instructions.
23449 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
23450 return ReplacedSplat;
23451
23452 SDLoc DL(S);
23453
23454 // Split VT into two.
23455 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
23456 unsigned NumElts = HalfVT.getVectorNumElements();
23457 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23458 DAG.getConstant(0, DL, MVT::i64));
23459 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23460 DAG.getConstant(NumElts, DL, MVT::i64));
23461 SDValue BasePtr = S->getBasePtr();
23462 SDValue NewST1 =
23463 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
23464 S->getAlign(), S->getMemOperand()->getFlags());
23465 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23466 DAG.getConstant(8, DL, MVT::i64));
23467 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
23468 S->getPointerInfo(), S->getAlign(),
23469 S->getMemOperand()->getFlags());
23470}
23471
23472 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
23473 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
23474
23475 // splice(pg, op1, undef) -> op1
23476 if (N->getOperand(2).isUndef())
23477 return N->getOperand(1);
23478
23479 return SDValue();
23480}
23481
23482 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
23483 const AArch64Subtarget *Subtarget) {
23484 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
23485 N->getOpcode() == AArch64ISD::UUNPKLO) &&
23486 "Unexpected Opcode!");
23487
23488 // uunpklo/hi undef -> undef
23489 if (N->getOperand(0).isUndef())
23490 return DAG.getUNDEF(N->getValueType(0));
23491
23492 // If this is a masked load followed by an UUNPKLO, fold this into a masked
23493 // extending load. We can do this even if this is already a masked
23494 // {z,}extload.
23495 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
23496 N->getOpcode() == AArch64ISD::UUNPKLO) {
23497 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
23498 SDValue Mask = MLD->getMask();
23499 SDLoc DL(N);
23500
23501 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
23502 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23503 (MLD->getPassThru()->isUndef() ||
23504 isZerosVector(MLD->getPassThru().getNode()))) {
23505 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23506 unsigned PgPattern = Mask->getConstantOperandVal(0);
23507 EVT VT = N->getValueType(0);
23508
23509 // Ensure we can double the size of the predicate pattern
23510 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23511 if (NumElts &&
23512 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
23513 Mask =
23514 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
23515 SDValue PassThru = DAG.getConstant(0, DL, VT);
23516 SDValue NewLoad = DAG.getMaskedLoad(
23517 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
23518 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
23519             MLD->getAddressingMode(), ISD::ZEXTLOAD);
23520
23521 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
23522
23523 return NewLoad;
23524 }
23525 }
23526 }
23527
23528 return SDValue();
23529}
23530
23532 if (N->getOpcode() != AArch64ISD::UZP1)
23533 return false;
23534 SDValue Op0 = N->getOperand(0);
23535 EVT SrcVT = Op0->getValueType(0);
23536 EVT DstVT = N->getValueType(0);
23537 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23538 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23539 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23540}
23541
23542// Try to combine rounding shifts where the operands come from an extend, and
23543// the result is truncated and combined into one vector.
23544// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23545 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
23546 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23547 SDValue Op0 = N->getOperand(0);
23548 SDValue Op1 = N->getOperand(1);
23549 EVT ResVT = N->getValueType(0);
23550
23551 unsigned RshOpc = Op0.getOpcode();
23552 if (RshOpc != AArch64ISD::RSHRNB_I)
23553 return SDValue();
23554
23555 // Same op code and imm value?
23556 SDValue ShiftValue = Op0.getOperand(1);
23557 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
23558 return SDValue();
23559
23560 // Same unextended operand value?
23561 SDValue Lo = Op0.getOperand(0);
23562 SDValue Hi = Op1.getOperand(0);
23563 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23564 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23565 return SDValue();
23566 SDValue OrigArg = Lo.getOperand(0);
23567 if (OrigArg != Hi.getOperand(0))
23568 return SDValue();
23569
23570 SDLoc DL(N);
23571 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
23572 getPredicateForVector(DAG, DL, ResVT), OrigArg,
23573 ShiftValue);
23574}
23575
23576// Try to simplify:
23577// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23578// t2 = nxv8i16 srl(t1, ShiftValue)
23579// to
23580// t1 = nxv8i16 rshrnb(X, shiftvalue).
23581// rshrnb will zero the top half bits of each element. Therefore, this combine
23582// should only be performed when a following instruction with the rshrnb
23583// as an operand does not care about the top half of each element. For example,
23584// a uzp1 or a truncating store.
23585 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23586 const AArch64Subtarget *Subtarget) {
23587 EVT VT = Srl->getValueType(0);
23588 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23589 return SDValue();
23590
23591 EVT ResVT;
23592 if (VT == MVT::nxv8i16)
23593 ResVT = MVT::nxv16i8;
23594 else if (VT == MVT::nxv4i32)
23595 ResVT = MVT::nxv8i16;
23596 else if (VT == MVT::nxv2i64)
23597 ResVT = MVT::nxv4i32;
23598 else
23599 return SDValue();
23600
23601 SDLoc DL(Srl);
23602 unsigned ShiftValue;
23603 SDValue RShOperand;
23604 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23605 return SDValue();
23606 SDValue Rshrnb = DAG.getNode(
23607 AArch64ISD::RSHRNB_I, DL, ResVT,
23608 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23609 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23610}
23611
23612 static SDValue isNVCastToHalfWidthElements(SDValue V) {
23613 if (V.getOpcode() != AArch64ISD::NVCAST)
23614 return SDValue();
23615
23616 SDValue Op = V.getOperand(0);
23617 if (!Op.getValueType().isVector() ||
23618 V.getValueType().getVectorElementCount() !=
23619 Op.getValueType().getVectorElementCount() * 2)
23620 return SDValue();
23621
23622 return Op;
23623}
23624
23625 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23626 const AArch64Subtarget *Subtarget) {
23627 SDLoc DL(N);
23628 SDValue Op0 = N->getOperand(0);
23629 SDValue Op1 = N->getOperand(1);
23630 EVT ResVT = N->getValueType(0);
23631
23632 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23633 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23634       Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23635 Op0.getOperand(0) == Op1.getOperand(0)) {
23636
23637 SDValue SourceVec = Op0.getOperand(0);
23638 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23639 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23640 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23641 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23642 EVT OpVT = Op0.getOperand(1).getValueType();
23643 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23644 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23645 DAG.getUNDEF(WidenedResVT));
23646 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23647 DAG.getConstant(0, DL, OpVT));
23648 }
23649 }
23650
23651 // Following optimizations only work with uzp1.
23652 if (N->getOpcode() == AArch64ISD::UZP2)
23653 return SDValue();
23654
23655 // uzp1(x, undef) -> concat(truncate(x), undef)
23656 if (Op1.getOpcode() == ISD::UNDEF) {
23657 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23658 switch (ResVT.getSimpleVT().SimpleTy) {
23659 default:
23660 break;
23661 case MVT::v16i8:
23662 BCVT = MVT::v8i16;
23663 HalfVT = MVT::v8i8;
23664 break;
23665 case MVT::v8i16:
23666 BCVT = MVT::v4i32;
23667 HalfVT = MVT::v4i16;
23668 break;
23669 case MVT::v4i32:
23670 BCVT = MVT::v2i64;
23671 HalfVT = MVT::v2i32;
23672 break;
23673 }
23674 if (BCVT != MVT::Other) {
23675 SDValue BC = DAG.getBitcast(BCVT, Op0);
23676 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23677 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23678 DAG.getUNDEF(HalfVT));
23679 }
23680 }
23681
23682 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23683 return Urshr;
23684
23685 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23686 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23687 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23688 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23689 }
23690 }
23691
23692 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23693 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23694 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23695 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23696 }
23697 }
23698
23699 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23700 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23701 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23702 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23703 SDValue X = PreCast.getOperand(0).getOperand(0);
23704 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23705 }
23706 }
23707 }
23708
23709 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23710 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23711 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23712 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23713 SDValue Z = PreCast.getOperand(0).getOperand(1);
23714 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23715 }
23716 }
23717 }
23718
23719 // These optimizations only work on little endian.
23720 if (!DAG.getDataLayout().isLittleEndian())
23721 return SDValue();
23722
23723 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23724 // Example:
23725 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23726 // to
23727 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23729 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23730 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23731 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23732 Op1.getOperand(0));
23733 }
23734 }
23735
23736 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23737 return SDValue();
23738
23739 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23740 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23741
23742 // truncating uzp1(x, y) -> xtn(concat (x, y))
23743 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23744 EVT Op0Ty = SourceOp0.getValueType();
23745 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23746 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23747 SDValue Concat =
23748           DAG.getNode(ISD::CONCAT_VECTORS, DL,
23749                       Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
23750 SourceOp0, SourceOp1);
23751 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23752 }
23753 }
23754
23755 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23756 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23757 SourceOp1.getOpcode() != ISD::TRUNCATE)
23758 return SDValue();
23759 SourceOp0 = SourceOp0.getOperand(0);
23760 SourceOp1 = SourceOp1.getOperand(0);
23761
23762 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23763 !SourceOp0.getValueType().isSimple())
23764 return SDValue();
23765
23766 EVT ResultTy;
23767
23768 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23769 case MVT::v2i64:
23770 ResultTy = MVT::v4i32;
23771 break;
23772 case MVT::v4i32:
23773 ResultTy = MVT::v8i16;
23774 break;
23775 case MVT::v8i16:
23776 ResultTy = MVT::v16i8;
23777 break;
23778 default:
23779 return SDValue();
23780 }
23781
23782 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23783 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23784 SDValue UzpResult =
23785 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23786
23787 EVT BitcastResultTy;
23788
23789 switch (ResVT.getSimpleVT().SimpleTy) {
23790 case MVT::v2i32:
23791 BitcastResultTy = MVT::v2i64;
23792 break;
23793 case MVT::v4i16:
23794 BitcastResultTy = MVT::v4i32;
23795 break;
23796 case MVT::v8i8:
23797 BitcastResultTy = MVT::v8i16;
23798 break;
23799 default:
23800 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23801 }
23802
23803 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23804 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23805}
23806
23807 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23808 unsigned Opc = N->getOpcode();
23809
23810 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23811 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23812 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23813 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23814 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23815 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23816 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23817 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
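  // Classify the gather: the offset-extension fold below only applies to
  // variants that do not already sign- or zero-extend their offsets.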
23818
23819 SDLoc DL(N);
23820 SDValue Chain = N->getOperand(0);
23821 SDValue Pg = N->getOperand(1);
23822 SDValue Base = N->getOperand(2);
23823 SDValue Offset = N->getOperand(3);
23824 SDValue Ty = N->getOperand(4);
23825
23826 EVT ResVT = N->getValueType(0);
23827
23828 const auto OffsetOpc = Offset.getOpcode();
23829 const bool OffsetIsZExt =
23830 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23831 const bool OffsetIsSExt =
23832 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23833
23834 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23835 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23836 SDValue ExtPg = Offset.getOperand(0);
23837 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
23838 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23839
23840 // If the predicate for the sign- or zero-extended offset is the
23841 // same as the predicate used for this load and the sign-/zero-extension
23842 // was from a 32-bits...
23843 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23844 SDValue UnextendedOffset = Offset.getOperand(1);
23845
23846 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
23847 if (Signed)
23848 NewOpc = getSignExtendedGatherOpcode(NewOpc);
23849
23850 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
23851 {Chain, Pg, Base, UnextendedOffset, Ty});
23852 }
23853 }
23854
23855 return SDValue();
23856}
23857
23858/// Optimize a vector shift instruction and its operand if shifted out
23859/// bits are not used.
23860 static SDValue performVectorShiftCombine(SDNode *N,
23861 const AArch64TargetLowering &TLI,
23862 TargetLowering::DAGCombinerInfo &DCI) {
23863 assert(N->getOpcode() == AArch64ISD::VASHR ||
23864 N->getOpcode() == AArch64ISD::VLSHR);
23865
23866 SDValue Op = N->getOperand(0);
23867 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23868
23869 unsigned ShiftImm = N->getConstantOperandVal(1);
23870 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23871
23872 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
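  // For instance, if x already carries more than ShiftImm sign bits, the
  // sign_extend_inreg idiom below is a no-op:
  //   (vashr (vshl x, #imm), #imm) -> x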
23873 if (N->getOpcode() == AArch64ISD::VASHR &&
23874 Op.getOpcode() == AArch64ISD::VSHL &&
23875 N->getOperand(1) == Op.getOperand(1))
23876 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
23877 return Op.getOperand(0);
23878
23879 // If the shift is exact, the shifted out bits matter.
23880 if (N->getFlags().hasExact())
23881 return SDValue();
23882
23883 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
23884 APInt DemandedMask = ~ShiftedOutBits;
23885
23886 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
23887 return SDValue(N, 0);
23888
23889 return SDValue();
23890}
23891
23892 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23893 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23894 // This transform works in partnership with performSetCCPunpkCombine to
23895 // remove unnecessary transfer of predicates into standard registers and back
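  // For example, with illustrative scalable types:
  //   (nxv2i64 sunpklo (nxv4i32 sign_extend (nxv4i1 pred)))
  //     -> (nxv2i64 sign_extend (nxv2i1 extract_subvector pred, 0))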
23896 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
23897 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
23898 MVT::i1) {
23899 SDValue CC = N->getOperand(0)->getOperand(0);
23900 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
23901 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
23902 DAG.getVectorIdxConstant(0, SDLoc(N)));
23903 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
23904 }
23905
23906 return SDValue();
23907}
23908
23909/// Target-specific DAG combine function for post-increment LD1 (lane) and
23910/// post-increment LD1R.
23911 static SDValue performPostLD1Combine(SDNode *N,
23912 TargetLowering::DAGCombinerInfo &DCI,
23913 bool IsLaneOp) {
23914 if (DCI.isBeforeLegalizeOps())
23915 return SDValue();
23916
23917 SelectionDAG &DAG = DCI.DAG;
23918 EVT VT = N->getValueType(0);
23919
23920 if (!VT.is128BitVector() && !VT.is64BitVector())
23921 return SDValue();
23922
23923 // If it is not a LOAD, we cannot do this combine.
23924 unsigned LoadIdx = IsLaneOp ? 1 : 0;
23925 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
23926 if (!LD)
23927 return SDValue();
23928
23929 // If the Generic combiner already helped form a pre- or post-indexed load,
23930 // skip forming one here.
23931 if (LD->isIndexed())
23932 return SDValue();
23933
23934 // The vector lane must be a constant in the LD1LANE opcode.
23935 SDValue Lane;
23936 if (IsLaneOp) {
23937 Lane = N->getOperand(2);
23938 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
23939 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23940 return SDValue();
23941 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
23942 return SDValue();
23943 }
23944
23945 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
23946 EVT MemVT = LoadSDN->getMemoryVT();
23947 // Check if memory operand is the same type as the vector element.
23948 if (MemVT != VT.getVectorElementType())
23949 return SDValue();
23950
23951 // Check if there are other uses. If so, do not combine as it will introduce
23952 // an extra load.
23953 for (SDUse &U : LD->uses()) {
23954 if (U.getResNo() == 1) // Ignore uses of the chain result.
23955 continue;
23956 if (U.getUser() != N)
23957 return SDValue();
23958 }
23959
23960 // If there is one use and it can splat the value, prefer that operation.
23961 // TODO: This could be expanded to more operations if they reliably use the
23962 // index variants.
23963 if (N->hasOneUse()) {
23964 unsigned UseOpc = N->user_begin()->getOpcode();
23965 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23966 return SDValue();
23967 }
23968
23969 SDValue Addr = LD->getOperand(1);
23970 SDValue Vector = N->getOperand(0);
23971 // Search for a use of the address operand that is an increment.
23972 for (SDUse &Use : Addr->uses()) {
23973 SDNode *User = Use.getUser();
23974 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23975 continue;
23976
23977 // If the increment is a constant, it must match the memory ref size.
23978 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23979 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
23980 uint32_t IncVal = CInc->getZExtValue();
23981 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
23982 if (IncVal != NumBytes)
23983 continue;
23984 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
23985 }
23986
23987 // To avoid cycle construction make sure that neither the load nor the add
23988 // are predecessors to each other or the Vector.
23989 SmallPtrSet<const SDNode *, 32> Visited;
23990 SmallVector<const SDNode *, 16> Worklist;
23991 Visited.insert(Addr.getNode());
23992 Worklist.push_back(User);
23993 Worklist.push_back(LD);
23994 Worklist.push_back(Vector.getNode());
23995 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
23996 SDNode::hasPredecessorHelper(User, Visited, Worklist))
23997 continue;
23998
23999 SmallVector<SDValue, 8> Ops;
24000 Ops.push_back(LD->getOperand(0)); // Chain
24001 if (IsLaneOp) {
24002 Ops.push_back(Vector); // The vector to be inserted
24003 Ops.push_back(Lane); // The lane to be inserted in the vector
24004 }
24005 Ops.push_back(Addr);
24006 Ops.push_back(Inc);
24007
24008 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
24009 SDVTList SDTys = DAG.getVTList(Tys);
24010 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
24011 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
24012 MemVT,
24013 LoadSDN->getMemOperand());
24014
24015 // Update the uses.
24016 SDValue NewResults[] = {
24017 SDValue(LD, 0), // The result of load
24018 SDValue(UpdN.getNode(), 2) // Chain
24019 };
24020 DCI.CombineTo(LD, NewResults);
24021 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
24022 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
24023
24024 break;
24025 }
24026 return SDValue();
24027}
24028
24029/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
24030/// address translation.
24031 static bool performTBISimplification(SDValue Addr,
24032 TargetLowering::DAGCombinerInfo &DCI,
24033 SelectionDAG &DAG) {
24034 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
24035 KnownBits Known;
24036 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
24037 !DCI.isBeforeLegalizeOps());
24038 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24039 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
24040 DCI.CommitTargetLoweringOpt(TLO);
24041 return true;
24042 }
24043 return false;
24044}
24045
24046 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
24047 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
24048 "Expected STORE dag node in input!");
24049
24050 if (auto Store = dyn_cast<StoreSDNode>(N)) {
24051 if (!Store->isTruncatingStore() || Store->isIndexed())
24052 return SDValue();
24053 SDValue Ext = Store->getValue();
24054 auto ExtOpCode = Ext.getOpcode();
24055 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
24056 ExtOpCode != ISD::ANY_EXTEND)
24057 return SDValue();
24058 SDValue Orig = Ext->getOperand(0);
24059 if (Store->getMemoryVT() != Orig.getValueType())
24060 return SDValue();
24061 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
24062 Store->getBasePtr(), Store->getMemOperand());
24063 }
24064
24065 return SDValue();
24066}
24067
24068// A custom combine to lower load <3 x i8> as the more efficient sequence
24069// below:
24070// ldrb wX, [x0, #2]
24071// ldrh wY, [x0]
24072// orr wX, wY, wX, lsl #16
24073// fmov s0, wX
24074//
24075// Note that an alternative sequence with even fewer (although usually more
24076// complex/expensive) instructions would be:
24077// ld1r.4h { v0 }, [x0], #2
24078// ld1.b { v0 }[2], [x0]
24079//
24080// Generating this sequence unfortunately results in noticeably worse codegen
24081// for code that extends the loaded v3i8, due to legalization breaking vector
24082// shuffle detection in a way that is very difficult to work around.
24083// TODO: Revisit once v3i8 legalization has been improved in general.
24084 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
24085 EVT MemVT = LD->getMemoryVT();
24086 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
24087 LD->getBaseAlign() >= 4)
24088 return SDValue();
24089
24090 SDLoc DL(LD);
24091 MachineFunction &MF = DAG.getMachineFunction();
24092 SDValue Chain = LD->getChain();
24093 SDValue BasePtr = LD->getBasePtr();
24094 MachineMemOperand *MMO = LD->getMemOperand();
24095 assert(LD->getOffset().isUndef() && "undef offset expected");
24096
24097 // Load 2 x i8, then 1 x i8.
24098 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
24099 TypeSize Offset2 = TypeSize::getFixed(2);
24100 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
24101 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
24102 MF.getMachineMemOperand(MMO, 2, 1));
24103
24104 // Extend to i32.
24105 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
24106 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
24107
24108 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
24109 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
24110 DAG.getConstant(16, DL, MVT::i32));
24111 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
24112 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
24113
24114 // Extract v3i8 again.
24115 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
24116 DAG.getConstant(0, DL, MVT::i64));
24117 SDValue TokenFactor = DAG.getNode(
24118 ISD::TokenFactor, DL, MVT::Other,
24119 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
24120 return DAG.getMergeValues({Extract, TokenFactor}, DL);
24121}
24122
24123// Perform TBI simplification if supported by the target, and try to break up
24124// non-temporal loads larger than 256 bits for odd types so that LDNPQ 256-bit
24125// load instructions can be selected.
24126 static SDValue performLOADCombine(SDNode *N,
24127 TargetLowering::DAGCombinerInfo &DCI,
24128 SelectionDAG &DAG,
24129 const AArch64Subtarget *Subtarget) {
24130 if (Subtarget->supportsAddressTopByteIgnored())
24131 performTBISimplification(N->getOperand(1), DCI, DAG);
24132
24133 LoadSDNode *LD = cast<LoadSDNode>(N);
24134 EVT RegVT = LD->getValueType(0);
24135 EVT MemVT = LD->getMemoryVT();
24136 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24137 SDLoc DL(LD);
24138
24139 // Cast ptr32 and ptr64 pointers to the default address space before a load.
24140 unsigned AddrSpace = LD->getAddressSpace();
24141 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24142 AddrSpace == ARM64AS::PTR32_UPTR) {
24143 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24144 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
24145 SDValue Cast =
24146 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
24147 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
24148 Cast, LD->getPointerInfo(), MemVT,
24149 LD->getBaseAlign(),
24150 LD->getMemOperand()->getFlags());
24151 }
24152 }
24153
24154 if (LD->isVolatile() || !Subtarget->isLittleEndian())
24155 return SDValue(N, 0);
24156
24157 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24158 return Res;
24159
24160 if (!LD->isNonTemporal())
24161 return SDValue(N, 0);
24162
24163 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
24164 MemVT.getSizeInBits() % 256 == 0 ||
24165 256 % MemVT.getScalarSizeInBits() != 0)
24166 return SDValue(N, 0);
24167
24168 SDValue Chain = LD->getChain();
24169 SDValue BasePtr = LD->getBasePtr();
24170 SDNodeFlags Flags = LD->getFlags();
24171 SmallVector<SDValue, 4> LoadOps;
24172 SmallVector<SDValue, 4> LoadOpsChain;
24173 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
24174 // and a final scalar/vector load of less than 256 bits. This way we can use
24175 // 256-bit loads and reduce the number of load instructions generated.
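  // For example, a non-temporal v17i32 load (544 bits) would be split here into
  // two v8i32 loads plus a v1i32 load for the remaining element; the pieces are
  // concatenated and the original v17i32 value is extracted from the result.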
24176 MVT NewVT =
24177 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
24178 256 / MemVT.getVectorElementType().getSizeInBits());
24179 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
24180 // Create all 256-bit loads starting from offset 0 and up to (Num256Loads-1)*32.
24181 for (unsigned I = 0; I < Num256Loads; I++) {
24182 unsigned PtrOffset = I * 32;
24183 SDValue NewPtr = DAG.getMemBasePlusOffset(
24184 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24185 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24186 SDValue NewLoad = DAG.getLoad(
24187 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
24188 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
24189 LoadOps.push_back(NewLoad);
24190 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
24191 }
24192
24193 // Process remaining bits of the load operation.
24194 // This is done by creating an UNDEF vector to match the size of the
24195 // 256-bit loads and inserting the remaining load into it. We extract the
24196 // original load type at the end using an EXTRACT_SUBVECTOR instruction.
24197 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
24198 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
24199 MVT RemainingVT = MVT::getVectorVT(
24200 MemVT.getVectorElementType().getSimpleVT(),
24201 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
24202 SDValue NewPtr = DAG.getMemBasePlusOffset(
24203 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24204 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24205 SDValue RemainingLoad =
24206 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
24207 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
24208 LD->getMemOperand()->getFlags(), LD->getAAInfo());
24209 SDValue UndefVector = DAG.getUNDEF(NewVT);
24210 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
24211 SDValue ExtendedRemainingLoad =
24212 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
24213 {UndefVector, RemainingLoad, InsertIdx});
24214 LoadOps.push_back(ExtendedRemainingLoad);
24215 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
24216 EVT ConcatVT =
24217 EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
24218 LoadOps.size() * NewVT.getVectorNumElements());
24219 SDValue ConcatVectors =
24220 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
24221 // Extract the original vector type size.
24222 SDValue ExtractSubVector =
24223 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
24224 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
24225 SDValue TokenFactor =
24226 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
24227 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
24228}
24229
24230 static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
24231 EVT VecVT = Op.getValueType();
24232 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
24233 "Need boolean vector type.");
24234
24235 if (Depth > 3)
24236 return EVT();
24237
24238 // We can get the base type from a vector compare or truncate.
24239 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
24240 return Op.getOperand(0).getValueType();
24241
24242 // If an operand is a bool vector, continue looking.
24243 EVT BaseVT;
24244 for (SDValue Operand : Op->op_values()) {
24245 if (Operand.getValueType() != VecVT)
24246 continue;
24247
24248 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
24249 if (!BaseVT.isSimple())
24250 BaseVT = OperandVT;
24251 else if (OperandVT != BaseVT)
24252 return EVT();
24253 }
24254
24255 return BaseVT;
24256}
24257
24258// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
24259// iN, we can use a trick that extracts the i^th bit from the i^th element and
24260// then performs a vector add to get a scalar bitmask. This requires that each
24261// element's bits are either all 1 or all 0.
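// For example, for a v4i32 comparison result whose lanes are all-ones or
// all-zeros:
//   bits = (and result, {0x1, 0x2, 0x4, 0x8})
//   mask = (vecreduce_add bits)   ; an i32 whose low 4 bits form the lane mask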
24262 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
24263 SDLoc DL(N);
24264 SDValue ComparisonResult(N, 0);
24265 EVT VecVT = ComparisonResult.getValueType();
24266 assert(VecVT.isVector() && "Must be a vector type");
24267
24268 unsigned NumElts = VecVT.getVectorNumElements();
24269 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
24270 return SDValue();
24271
24272 if (VecVT.getVectorElementType() != MVT::i1 &&
24273 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
24274 return SDValue();
24275
24276 // If we can find the original types to work on instead of a vector of i1,
24277 // we can avoid extend/extract conversion instructions.
24278 if (VecVT.getVectorElementType() == MVT::i1) {
24279 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
24280 if (!VecVT.isSimple()) {
24281 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
24282 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
24283 }
24284 }
24285 VecVT = VecVT.changeVectorElementTypeToInteger();
24286
24287 // Large vectors don't map directly to this conversion, so to avoid too many
24288 // edge cases, we don't apply it here. The conversion will likely still be
24289 // applied later via multiple smaller vectors, whose results are concatenated.
24290 if (VecVT.getSizeInBits() > 128)
24291 return SDValue();
24292
24293 // Ensure that all elements' bits are either 0s or 1s.
24294 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
24295
24296 SmallVector<SDValue, 16> MaskConstants;
24298 VecVT == MVT::v16i8) {
24299 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
24300 // per entry. We split it into two halves, apply the mask, zip the halves to
24301 // create 8x 16-bit values, and then perform the vector reduce.
24302 for (unsigned Half = 0; Half < 2; ++Half) {
24303 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
24304 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
24305 }
24306 }
24307 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24308 SDValue RepresentativeBits =
24309 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24310
24311 SDValue UpperRepresentativeBits =
24312 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
24313 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
24314 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
24315 RepresentativeBits, UpperRepresentativeBits);
24316 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
24317 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
24318 }
24319
24320 // All other vector sizes.
24321 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
24322 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
24323 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
24324 }
24325
24326 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24327 SDValue RepresentativeBits =
24328 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24329 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
24330 NumElts, VecVT.getVectorElementType().getSizeInBits()));
24331 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
24332}
24333
24334 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
24335 StoreSDNode *Store) {
24336 if (!Store->isTruncatingStore())
24337 return SDValue();
24338
24339 SDLoc DL(Store);
24340 SDValue VecOp = Store->getValue();
24341 EVT VT = VecOp.getValueType();
24342 EVT MemVT = Store->getMemoryVT();
24343
24344 if (!MemVT.isVector() || !VT.isVector() ||
24345 MemVT.getVectorElementType() != MVT::i1)
24346 return SDValue();
24347
24348 // If we are storing a vector that we are currently building, let
24349 // `scalarizeVectorStore()` handle this more efficiently.
24350 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
24351 return SDValue();
24352
24353 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
24354 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
24355 if (!VectorBits)
24356 return SDValue();
24357
24358 EVT StoreVT =
24360 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
24361 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
24362 Store->getMemOperand());
24363}
24364
24365// Combine store (fp_to_int X) to use vector semantics around the conversion
24366// when NEON is available. This allows us to store the in-vector result directly
24367// without transferring the result into a GPR in the process.
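// For example, rather than
//   fcvtzs w8, s0
//   str    w8, [x0]
// this combine allows the conversion and store to stay in the FP/SIMD
// registers:
//   fcvtzs s0, s0
//   str    s0, [x0]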
24368 static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24369 TargetLowering::DAGCombinerInfo &DCI,
24370 SelectionDAG &DAG,
24371 const AArch64Subtarget *Subtarget) {
24372 // Limit to post-legalization in order to avoid peeling truncating stores.
24373 if (DCI.isBeforeLegalize())
24374 return SDValue();
24375 if (!Subtarget->isNeonAvailable())
24376 return SDValue();
24377 // Source operand is already a vector.
24378 SDValue Value = ST->getValue();
24379 if (Value.getValueType().isVector())
24380 return SDValue();
24381
24382 // Look through potential assertions.
24383 while (Value->isAssert())
24384 Value = Value.getOperand(0);
24385
24386 if (Value.getOpcode() != ISD::FP_TO_SINT &&
24387 Value.getOpcode() != ISD::FP_TO_UINT)
24388 return SDValue();
24389 if (!Value->hasOneUse())
24390 return SDValue();
24391
24392 SDValue FPSrc = Value.getOperand(0);
24393 EVT SrcVT = FPSrc.getValueType();
24394 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24395 return SDValue();
24396
24397 // No support for mismatched widths, such as i64 = fp_to_sint f32.
24398 EVT VT = Value.getSimpleValueType();
24399 if (VT != SrcVT.changeTypeToInteger())
24400 return SDValue();
24401
24402 // Create a 128-bit vector to avoid widening. The floating point
24403 // conversion is transformed into a single element conversion via a pattern.
24404 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24405 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24406 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24407 SDLoc DL(ST);
24408 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24409 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24410
24411 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24412 SDValue Extracted =
24413 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24414
24415 DCI.CombineTo(ST->getValue().getNode(), Extracted);
24416 return SDValue(ST, 0);
24417}
24418
24419 static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
24420 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
24421 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
24422 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
24423}
24424
24425// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
24426 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
24427 const AArch64Subtarget *Subtarget) {
24428 SDValue Value = ST->getValue();
24429 EVT ValueVT = Value.getValueType();
24430
24431 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
24432 Value.getOpcode() != ISD::TRUNCATE ||
24433 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
24434 return SDValue();
24435
24436 assert(ST->getOffset().isUndef() && "undef offset expected");
24437 SDLoc DL(ST);
24438 auto WideVT = EVT::getVectorVT(
24439 *DAG.getContext(),
24440 Value->getOperand(0).getValueType().getVectorElementType(), 4);
24441 SDValue UndefVector = DAG.getUNDEF(WideVT);
24442 SDValue WideTrunc = DAG.getNode(
24443 ISD::INSERT_SUBVECTOR, DL, WideVT,
24444 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
24445 SDValue Cast = DAG.getNode(
24446 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
24447 WideTrunc);
24448
24450 SDValue Chain = ST->getChain();
24451 MachineMemOperand *MMO = ST->getMemOperand();
24452 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
24453 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24454 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
24455 TypeSize Offset2 = TypeSize::getFixed(2);
24456 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
24457 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
24458
24459 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24460 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
24461 TypeSize Offset1 = TypeSize::getFixed(1);
24462 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
24463 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
24464
24465 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24466 DAG.getConstant(0, DL, MVT::i64));
24467 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
24468 MF.getMachineMemOperand(MMO, 0, 1));
24469 return Chain;
24470}
24471
24472static unsigned getFPSubregForVT(EVT VT) {
24473 assert(VT.isSimple() && "Expected simple VT");
24474 switch (VT.getSimpleVT().SimpleTy) {
24475 case MVT::aarch64mfp8:
24476 return AArch64::bsub;
24477 case MVT::f16:
24478 return AArch64::hsub;
24479 case MVT::f32:
24480 return AArch64::ssub;
24481 case MVT::f64:
24482 return AArch64::dsub;
24483 default:
24484 llvm_unreachable("Unexpected VT!");
24485 }
24486}
24487
24488 static SDValue performSTORECombine(SDNode *N,
24489 TargetLowering::DAGCombinerInfo &DCI,
24490 SelectionDAG &DAG,
24491 const AArch64Subtarget *Subtarget) {
24492 StoreSDNode *ST = cast<StoreSDNode>(N);
24493 SDValue Chain = ST->getChain();
24494 SDValue Value = ST->getValue();
24495 SDValue Ptr = ST->getBasePtr();
24496 EVT ValueVT = Value.getValueType();
24497 EVT MemVT = ST->getMemoryVT();
24498 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24499 SDLoc DL(ST);
24500
24501 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24502 return Res;
24503
24504 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
24505 EVT EltVT = VT.getVectorElementType();
24506 return EltVT == MVT::f32 || EltVT == MVT::f64;
24507 };
24508
24509 // Cast ptr32 and ptr64 pointers to the default address space before a store.
24510 unsigned AddrSpace = ST->getAddressSpace();
24511 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24512 AddrSpace == ARM64AS::PTR32_UPTR) {
24513 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24514 if (PtrVT != Ptr.getSimpleValueType()) {
24515 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
24516 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
24517 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
24518 ST->getAAInfo());
24519 }
24520 }
24521
24522 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
24523 return Res;
24524
24525 // If this is an FP_ROUND followed by a store, fold this into a truncating
24526 // store. We can do this even if this is already a truncstore.
24527 // We purposefully don't care about legality of the nodes here as we know
24528 // they can be split down into something legal.
24529 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
24530 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
24531 Subtarget->useSVEForFixedLengthVectors() &&
24532 ValueVT.isFixedLengthVector() &&
24533 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
24534 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
24535 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
24536 ST->getMemOperand());
24537
24538 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
24539 return Split;
24540
24541 if (Subtarget->supportsAddressTopByteIgnored() &&
24542 performTBISimplification(N->getOperand(2), DCI, DAG))
24543 return SDValue(N, 0);
24544
24545 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
24546 return Store;
24547
24548 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
24549 return Store;
24550
24551 if (ST->isTruncatingStore() &&
24552 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
24553 if (SDValue Rshrnb =
24554 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
24555 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
24556 MemVT, ST->getMemOperand());
24557 }
24558 }
24559
24560 // This is an integer vector_extract_elt followed by a (possibly truncating)
24561 // store. We may be able to replace this with a store of an FP subregister.
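  // For example, for an i32 extracted from lane 0 and stored to [x0], rather
  // than
  //   fmov w8, s0
  //   str  w8, [x0]
  // we can emit
  //   str  s0, [x0]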
24562 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
24563 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24564
24565 SDValue Vector = Value.getOperand(0);
24566 SDValue ExtIdx = Value.getOperand(1);
24567 EVT VectorVT = Vector.getValueType();
24568 EVT ElemVT = VectorVT.getVectorElementType();
24569
24570 if (!ValueVT.isInteger())
24571 return SDValue();
24572
24573 // Propagate zero constants (applying this fold may miss optimizations).
24575 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
24576 DAG.ReplaceAllUsesWith(Value, ZeroElt);
24577 return SDValue();
24578 }
24579
24580 if (ValueVT != MemVT && !ST->isTruncatingStore())
24581 return SDValue();
24582
24583 // This could generate an additional extract if the index is non-zero and
24584 // the extracted value has multiple uses.
24585 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24586 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24587 return SDValue();
24588
24589 // These can lower to st1, which is preferable if we're unlikely to fold the
24590 // addressing into the store.
24591 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24592 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24593 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24594 return SDValue();
24595
24596 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24597 // Heuristic: If there are other users of w/x integer scalars extracted
24598 // from this vector that won't fold into the store -- abandon folding.
24599 // Applying this fold may disrupt paired stores.
24600 for (const auto &Use : Vector->uses()) {
24601 if (Use.getResNo() != Vector.getResNo())
24602 continue;
24603 const SDNode *User = Use.getUser();
24604 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24605 (!User->hasOneUse() ||
24606 (*User->user_begin())->getOpcode() != ISD::STORE))
24607 return SDValue();
24608 }
24609 }
24610
24611 SDValue ExtVector = Vector;
24612 if (!ExtCst || !ExtCst->isZero()) {
24613 // Handle extracting from lanes != 0.
24614 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24615 Value.getValueType(), Vector, ExtIdx);
24616 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24617 ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
24618 DAG.getUNDEF(VectorVT), Ext, Zero);
24619 }
24620
24621 EVT FPMemVT = MemVT == MVT::i8
24622 ? MVT::aarch64mfp8
24624 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24625 FPMemVT, ExtVector);
24626
24627 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
24628 ST->getMemOperand());
24629 }
24630
24631 return SDValue();
24632}
24633
24634 static SDValue performMSTORECombine(SDNode *N,
24635 TargetLowering::DAGCombinerInfo &DCI,
24636 SelectionDAG &DAG,
24637 const AArch64Subtarget *Subtarget) {
24638 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24639 SDValue Value = MST->getValue();
24640 SDValue Mask = MST->getMask();
24641 SDLoc DL(N);
24642
24643 // If this is a UZP1 followed by a masked store, fold this into a masked
24644 // truncating store. We can do this even if this is already a masked
24645 // truncstore.
24646 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24647 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24648 Value.getValueType().isInteger()) {
24649 Value = Value.getOperand(0);
24650 if (Value.getOpcode() == ISD::BITCAST) {
24651 EVT HalfVT =
24652 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
24653 EVT InVT = Value.getOperand(0).getValueType();
24654
24655 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
24656 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24657 unsigned PgPattern = Mask->getConstantOperandVal(0);
24658
24659 // Ensure we can double the size of the predicate pattern
24660 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24661 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24662 MinSVESize) {
24663 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
24664 PgPattern);
24665 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
24666 MST->getBasePtr(), MST->getOffset(), Mask,
24667 MST->getMemoryVT(), MST->getMemOperand(),
24668 MST->getAddressingMode(),
24669 /*IsTruncating=*/true);
24670 }
24671 }
24672 }
24673 }
24674
24675 if (MST->isTruncatingStore()) {
24676 EVT ValueVT = Value->getValueType(0);
24677 EVT MemVT = MST->getMemoryVT();
24678 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
24679 return SDValue();
24680 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
24681 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
24682 MST->getOffset(), MST->getMask(),
24683 MST->getMemoryVT(), MST->getMemOperand(),
24684 MST->getAddressingMode(), true);
24685 }
24686 }
24687
24688 return SDValue();
24689}
24690
24691/// \return true if part of the index was folded into the Base.
24692static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24693 SDLoc DL, SelectionDAG &DAG) {
24694 // This function assumes a vector of i64 indices.
24695 EVT IndexVT = Index.getValueType();
24696 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24697 return false;
24698
24699 // Simplify:
24700 // BasePtr = Ptr
24701 // Index = X + splat(Offset)
24702 // ->
24703 // BasePtr = Ptr + Offset * scale.
24704 // Index = X
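  // For example, with Scale == 4:
  //   BasePtr = Ptr,      Index = X + splat(16)
  //     becomes
  //   BasePtr = Ptr + 64, Index = X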
24705 if (Index.getOpcode() == ISD::ADD) {
24706 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
24707 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24708 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24709 Index = Index.getOperand(0);
24710 return true;
24711 }
24712 }
24713
24714 // Simplify:
24715 // BasePtr = Ptr
24716 // Index = (X + splat(Offset)) << splat(Shift)
24717 // ->
24718 // BasePtr = Ptr + (Offset << Shift) * scale
24719 // Index = X << splat(shift)
24720 if (Index.getOpcode() == ISD::SHL &&
24721 Index.getOperand(0).getOpcode() == ISD::ADD) {
24722 SDValue Add = Index.getOperand(0);
24723 SDValue ShiftOp = Index.getOperand(1);
24724 SDValue OffsetOp = Add.getOperand(1);
24725 if (auto Shift = DAG.getSplatValue(ShiftOp))
24726 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
24727 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
24728 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24729 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24730 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
24731 Add.getOperand(0), ShiftOp);
24732 return true;
24733 }
24734 }
24735
24736 return false;
24737}
24738
24739// Analyse the specified address returning true if a more optimal addressing
24740// mode is available. When returning true all parameters are updated to reflect
24741// their recommended values.
24742 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
24743 SDValue &BasePtr, SDValue &Index,
24744 SelectionDAG &DAG) {
24745 // Try to iteratively fold parts of the index into the base pointer to
24746 // simplify the index as much as possible.
24747 bool Changed = false;
24748 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
24749 Changed = true;
24750
24751 // Only consider element types that are pointer sized as smaller types can
24752 // be easily promoted.
24753 EVT IndexVT = Index.getValueType();
24754 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24755 return Changed;
24756
24757 // Can indices be trivially shrunk?
24758 EVT DataVT = N->getOperand(1).getValueType();
24759 // Don't attempt to shrink the index for fixed vectors of 64-bit data since it
24760 // will later be re-extended to 64 bits in legalization.
24761 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24762 return Changed;
24763 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
24764 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24765 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
24766 return true;
24767 }
24768
24769 // Match:
24770 // Index = step(const)
24771 int64_t Stride = 0;
24772 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24773 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
24774 }
24775 // Match:
24776 // Index = step(const) << shift(const)
24777 else if (Index.getOpcode() == ISD::SHL &&
24778 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
24779 SDValue RHS = Index.getOperand(1);
24780 if (auto *Shift =
24781 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
24782 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
24783 Stride = Step << Shift->getZExtValue();
24784 }
24785 }
24786
24787 // Return early because no supported pattern is found.
24788 if (Stride == 0)
24789 return Changed;
24790
24791 if (Stride < std::numeric_limits<int32_t>::min() ||
24792 Stride > std::numeric_limits<int32_t>::max())
24793 return Changed;
24794
24795 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24796 unsigned MaxVScale =
24798 int64_t LastElementOffset =
24799 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24800
24801 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24802 LastElementOffset > std::numeric_limits<int32_t>::max())
24803 return Changed;
24804
24805 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24806 // Stride does not scale explicitly by 'Scale', because it happens in
24807 // the gather/scatter addressing mode.
24808 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
24809 return true;
24810}
24811
24812 static SDValue performMaskedGatherScatterCombine(
24813 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24814 if (!DCI.isBeforeLegalize())
24815 return SDValue();
24816 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
24817
24818 SDLoc DL(MGS);
24819 SDValue Chain = MGS->getChain();
24820 SDValue Scale = MGS->getScale();
24821 SDValue Index = MGS->getIndex();
24822 SDValue Mask = MGS->getMask();
24823 SDValue BasePtr = MGS->getBasePtr();
24824 ISD::MemIndexType IndexType = MGS->getIndexType();
24825
24826 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
24827 return SDValue();
24828
24829 // Here we catch such cases early and change MGATHER's IndexType to allow
24830 // the use of an Index that's more legalisation friendly.
24831 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
24832 SDValue PassThru = MGT->getPassThru();
24833 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24834 return DAG.getMaskedGather(
24835 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
24836 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
24837 }
24838 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
24839 SDValue Data = MSC->getValue();
24840 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24841 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
24842 DL, Ops, MSC->getMemOperand(), IndexType,
24843 MSC->isTruncatingStore());
24844 }
24845 auto *HG = cast<MaskedHistogramSDNode>(MGS);
24846 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
24847 Index, Scale, HG->getIntID()};
24848 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
24849 DL, Ops, HG->getMemOperand(), IndexType);
24850}
24851
24852/// Target-specific DAG combine function for NEON load/store intrinsics
24853/// to merge base address updates.
24854 static SDValue performNEONPostLDSTCombine(SDNode *N,
24855 TargetLowering::DAGCombinerInfo &DCI,
24856 SelectionDAG &DAG) {
24857 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24858 return SDValue();
24859
24860 unsigned AddrOpIdx = N->getNumOperands() - 1;
24861 SDValue Addr = N->getOperand(AddrOpIdx);
24862
24863 // Search for a use of the address operand that is an increment.
24864 for (SDUse &Use : Addr->uses()) {
24865 SDNode *User = Use.getUser();
24866 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24867 continue;
24868
24869 // Check that the add is independent of the load/store. Otherwise, folding
24870 // it would create a cycle.
24871 SmallPtrSet<const SDNode *, 32> Visited;
24872 SmallVector<const SDNode *, 16> Worklist;
24873 Visited.insert(Addr.getNode());
24874 Worklist.push_back(N);
24875 Worklist.push_back(User);
24876 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
24877 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24878 continue;
24879
24880 // Find the new opcode for the updating load/store.
24881 bool IsStore = false;
24882 bool IsLaneOp = false;
24883 bool IsDupOp = false;
24884 unsigned NewOpc = 0;
24885 unsigned NumVecs = 0;
24886 unsigned IntNo = N->getConstantOperandVal(1);
24887 switch (IntNo) {
24888 default: llvm_unreachable("unexpected intrinsic for Neon base update");
24889 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
24890 NumVecs = 2; break;
24891 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
24892 NumVecs = 3; break;
24893 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
24894 NumVecs = 4; break;
24895 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
24896 NumVecs = 2; IsStore = true; break;
24897 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
24898 NumVecs = 3; IsStore = true; break;
24899 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
24900 NumVecs = 4; IsStore = true; break;
24901 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
24902 NumVecs = 2; break;
24903 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
24904 NumVecs = 3; break;
24905 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
24906 NumVecs = 4; break;
24907 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
24908 NumVecs = 2; IsStore = true; break;
24909 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
24910 NumVecs = 3; IsStore = true; break;
24911 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
24912 NumVecs = 4; IsStore = true; break;
24913 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
24914 NumVecs = 2; IsDupOp = true; break;
24915 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
24916 NumVecs = 3; IsDupOp = true; break;
24917 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
24918 NumVecs = 4; IsDupOp = true; break;
24919 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
24920 NumVecs = 2; IsLaneOp = true; break;
24921 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
24922 NumVecs = 3; IsLaneOp = true; break;
24923 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
24924 NumVecs = 4; IsLaneOp = true; break;
24925 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
24926 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
24927 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
24928 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
24929 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
24930 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
24931 }
24932
24933 EVT VecTy;
24934 if (IsStore)
24935 VecTy = N->getOperand(2).getValueType();
24936 else
24937 VecTy = N->getValueType(0);
24938
24939 // If the increment is a constant, it must match the memory ref size.
24940 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24941 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24942 uint32_t IncVal = CInc->getZExtValue();
24943 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
24944 if (IsLaneOp || IsDupOp)
24945 NumBytes /= VecTy.getVectorNumElements();
24946 if (IncVal != NumBytes)
24947 continue;
24948 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24949 }
24950 SmallVector<SDValue, 8> Ops;
24951 Ops.push_back(N->getOperand(0)); // Incoming chain
24952 // Load lane and store have vector list as input.
24953 if (IsLaneOp || IsStore)
24954 for (unsigned i = 2; i < AddrOpIdx; ++i)
24955 Ops.push_back(N->getOperand(i));
24956 Ops.push_back(Addr); // Base register
24957 Ops.push_back(Inc);
24958
24959 // Return Types.
24960 EVT Tys[6];
24961 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
24962 unsigned n;
24963 for (n = 0; n < NumResultVecs; ++n)
24964 Tys[n] = VecTy;
24965 Tys[n++] = MVT::i64; // Type of write back register
24966 Tys[n] = MVT::Other; // Type of the chain
24967 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
24968
24969 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
24970 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
24971 MemInt->getMemoryVT(),
24972 MemInt->getMemOperand());
24973
24974 // Update the uses.
24975 std::vector<SDValue> NewResults;
24976 for (unsigned i = 0; i < NumResultVecs; ++i) {
24977 NewResults.push_back(SDValue(UpdN.getNode(), i));
24978 }
24979 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
24980 DCI.CombineTo(N, NewResults);
24981 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
24982
24983 break;
24984 }
24985 return SDValue();
24986}
24987
24988// Checks to see if the value is the prescribed width and returns information
24989// about its extension mode.
24990static
24991bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
24992 ExtType = ISD::NON_EXTLOAD;
24993 switch(V.getNode()->getOpcode()) {
24994 default:
24995 return false;
24996 case ISD::LOAD: {
24997 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
24998 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
24999 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
25000 ExtType = LoadNode->getExtensionType();
25001 return true;
25002 }
25003 return false;
25004 }
25005 case ISD::AssertSext: {
25006 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25007 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25008 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25009 ExtType = ISD::SEXTLOAD;
25010 return true;
25011 }
25012 return false;
25013 }
25014 case ISD::AssertZext: {
25015 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25016 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25017 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25018 ExtType = ISD::ZEXTLOAD;
25019 return true;
25020 }
25021 return false;
25022 }
25023 case ISD::Constant:
25024 case ISD::TargetConstant: {
25025 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
25026 1LL << (width - 1);
25027 }
25028 }
25029
25030 return true;
25031}
25032
25033// This function does a whole lot of voodoo to determine if the tests are
25034// equivalent without and with a mask. Essentially what happens is that given a
25035// DAG resembling:
25036//
25037// +-------------+ +-------------+ +-------------+ +-------------+
25038// | Input | | AddConstant | | CompConstant| | CC |
25039// +-------------+ +-------------+ +-------------+ +-------------+
25040// | | | |
25041// V V | +----------+
25042// +-------------+ +----+ | |
25043// | ADD | |0xff| | |
25044// +-------------+ +----+ | |
25045// | | | |
25046// V V | |
25047// +-------------+ | |
25048// | AND | | |
25049// +-------------+ | |
25050// | | |
25051// +-----+ | |
25052// | | |
25053// V V V
25054// +-------------+
25055// | CMP |
25056// +-------------+
25057//
25058// The AND node may be safely removed for some combinations of inputs. In
25059// particular we need to take into account the extension type of the Input,
25060// the exact values of AddConstant, CompConstant, and CC, along with the nominal
25061 // width of the input (this can work for inputs of any width; the above graph
25062 // is specific to 8 bits).
25063//
25064// The specific equations were worked out by generating output tables for each
25065 // AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
25066// problem was simplified by working with 4 bit inputs, which means we only
25067// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
25068// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
25069// patterns present in both extensions (0,7). For every distinct set of
25070// AddConstant and CompConstants bit patterns we can consider the masked and
25071// unmasked versions to be equivalent if the result of this function is true for
25072 // all 16 distinct bit patterns for the current extension type of Input (w0).
25073//
25074// sub w8, w0, w1
25075// and w10, w8, #0x0f
25076// cmp w8, w2
25077// cset w9, AArch64CC
25078// cmp w10, w2
25079// cset w11, AArch64CC
25080// cmp w9, w11
25081// cset w0, eq
25082// ret
25083//
25084// Since the above function shows when the outputs are equivalent it defines
25085// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
25086// would be expensive to run during compiles. The equations below were written
25087 // in a test harness that confirmed they gave outputs equivalent to the above
25088 // function for all inputs, so they can be used to determine whether the
25089 // removal is legal instead.
25090//
25091 // isEquivalentMaskless() is the code for testing whether the AND can be removed,
25092 // factored out of the DAG recognition since the DAG can take several forms.
25093
25094static bool isEquivalentMaskless(unsigned CC, unsigned width,
25095 ISD::LoadExtType ExtType, int AddConstant,
25096 int CompConstant) {
25097 // By being careful about our equations and only writing them in terms of
25098 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
25099 // make them generally applicable to all bit widths.
25100 int MaxUInt = (1 << width);
25101
25102 // For the purposes of these comparisons sign extending the type is
25103 // equivalent to zero extending the add and displacing it by half the integer
25104 // width. Provided we are careful and make sure our equations are valid over
25105 // the whole range we can just adjust the input and avoid writing equations
25106 // for sign extended inputs.
25107 if (ExtType == ISD::SEXTLOAD)
25108 AddConstant -= (1 << (width-1));
25109
25110 switch(CC) {
25111 case AArch64CC::LE:
25112 case AArch64CC::GT:
25113 if ((AddConstant == 0) ||
25114 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
25115 (AddConstant >= 0 && CompConstant < 0) ||
25116 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
25117 return true;
25118 break;
25119 case AArch64CC::LT:
25120 case AArch64CC::GE:
25121 if ((AddConstant == 0) ||
25122 (AddConstant >= 0 && CompConstant <= 0) ||
25123 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
25124 return true;
25125 break;
25126 case AArch64CC::HI:
25127 case AArch64CC::LS:
25128 if ((AddConstant >= 0 && CompConstant < 0) ||
25129 (AddConstant <= 0 && CompConstant >= -1 &&
25130 CompConstant < AddConstant + MaxUInt))
25131 return true;
25132 break;
25133 case AArch64CC::PL:
25134 case AArch64CC::MI:
25135 if ((AddConstant == 0) ||
25136 (AddConstant > 0 && CompConstant <= 0) ||
25137 (AddConstant < 0 && CompConstant <= AddConstant))
25138 return true;
25139 break;
25140 case AArch64CC::LO:
25141 case AArch64CC::HS:
25142 if ((AddConstant >= 0 && CompConstant <= 0) ||
25143 (AddConstant <= 0 && CompConstant >= 0 &&
25144 CompConstant <= AddConstant + MaxUInt))
25145 return true;
25146 break;
25147 case AArch64CC::EQ:
25148 case AArch64CC::NE:
25149 if ((AddConstant > 0 && CompConstant < 0) ||
25150 (AddConstant < 0 && CompConstant >= 0 &&
25151 CompConstant < AddConstant + MaxUInt) ||
25152 (AddConstant >= 0 && CompConstant >= 0 &&
25153 CompConstant >= AddConstant) ||
25154 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
25155 return true;
25156 break;
25157 case AArch64CC::VS:
25158 case AArch64CC::VC:
25159 case AArch64CC::AL:
25160 case AArch64CC::NV:
25161 return true;
25162 case AArch64CC::Invalid:
25163 break;
25164 }
25165
25166 return false;
25167}
25168
25169 // (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
25170 // (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
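// For example, with C == 0xf0 and Mask == 0x3f:
//   (X & 0xf0) >u 0x3f  -->  ((X & 0xc0) != 0)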
25171 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
25172 SDNode *AndNode, SelectionDAG &DAG,
25173 unsigned CCIndex, unsigned CmpIndex,
25174 unsigned CC) {
25175 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
25176 if (!SubsC)
25177 return SDValue();
25178
25179 APInt SubsAP = SubsC->getAPIntValue();
25180 if (CC == AArch64CC::HI) {
25181 if (!SubsAP.isMask())
25182 return SDValue();
25183 } else if (CC == AArch64CC::LO) {
25184 if (!SubsAP.isPowerOf2())
25185 return SDValue();
25186 } else
25187 return SDValue();
25188
25189 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
25190 if (!AndC)
25191 return SDValue();
25192
25193 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
25194
25195 SDLoc DL(N);
25196 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
25197 SDValue ANDS = DAG.getNode(
25198 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
25199 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
25200 SDValue AArch64_CC =
25201 DAG.getConstant(CC, DL,
25202 N->getOperand(CCIndex)->getValueType(0));
25203
25204 // For now, only performCSELCombine and performBRCONDCombine call this
25205 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex with 4
25206 // operands, so just initialize the ops directly to simplify the code. If some
25207 // other caller ever uses a different CCIndex or CmpIndex, this will need to be
25208 // rewritten with a loop over the operands.
25209 // TODO: Do we need to assert number of operand is 4 here?
25210 assert((CCIndex == 2 && CmpIndex == 3) &&
25211 "Expected CCIndex to be 2 and CmpIndex to be 3.");
25212 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
25213 ANDS.getValue(1)};
25214 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
25215}
25216
25217 static
25218 SDValue performCONDCombine(SDNode *N,
25219 TargetLowering::DAGCombinerInfo &DCI,
25220 SelectionDAG &DAG, unsigned CCIndex,
25221 unsigned CmpIndex) {
25222 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
25223 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
25224 unsigned CondOpcode = SubsNode->getOpcode();
25225
25226 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
25227 !SubsNode->hasOneUse())
25228 return SDValue();
25229
25230 // There is a SUBS feeding this condition. Is it fed by a mask we can
25231 // use?
25232
25233 SDNode *AndNode = SubsNode->getOperand(0).getNode();
25234 unsigned MaskBits = 0;
25235
25236 if (AndNode->getOpcode() != ISD::AND)
25237 return SDValue();
25238
25239 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
25240 CmpIndex, CC))
25241 return Val;
25242
25243 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
25244 uint32_t CNV = CN->getZExtValue();
25245 if (CNV == 255)
25246 MaskBits = 8;
25247 else if (CNV == 65535)
25248 MaskBits = 16;
25249 }
25250
25251 if (!MaskBits)
25252 return SDValue();
25253
25254 SDValue AddValue = AndNode->getOperand(0);
25255
25256 if (AddValue.getOpcode() != ISD::ADD)
25257 return SDValue();
25258
25259 // The basic dag structure is correct, grab the inputs and validate them.
25260
25261 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
25262 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
25263 SDValue SubsInputValue = SubsNode->getOperand(1);
25264
25265 // The mask is present and the provenance of all the values is a smaller type,
25266 // let's see if the mask is superfluous.
25267
25268 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
25269 !isa<ConstantSDNode>(SubsInputValue.getNode()))
25270 return SDValue();
25271
25272 ISD::LoadExtType ExtType;
25273
25274 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
25275 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
25276 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
25277 return SDValue();
25278
25279 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
25280 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
25281 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
25282 return SDValue();
25283
25284 // The AND is not necessary, remove it.
25285
25286 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
25287 SubsNode->getValueType(1));
25288 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
25289
25290 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
25291 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
25292
25293 return SDValue(N, 0);
25294}
25295
25296// Optimize compare with zero and branch.
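// For example:
//   (brcond NE, (subs x, 0)) -> (cbnz x, dest)
//   (brcond EQ, (subs x, 0)) -> (cbz  x, dest)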
25297 static SDValue performBRCONDCombine(SDNode *N,
25298 TargetLowering::DAGCombinerInfo &DCI,
25299 SelectionDAG &DAG) {
25300 MachineFunction &MF = DAG.getMachineFunction();
25301 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
25302 // will not be produced, as they are conditional branch instructions that do
25303 // not set flags.
25304 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
25305 return SDValue();
25306
25307 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
25308 N = NV.getNode();
25309 SDValue Chain = N->getOperand(0);
25310 SDValue Dest = N->getOperand(1);
25311 SDValue CCVal = N->getOperand(2);
25312 SDValue Cmp = N->getOperand(3);
25313
25314 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
25315 unsigned CC = CCVal->getAsZExtVal();
25316 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
25317 return SDValue();
25318
25319 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
25320 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
25321 SDValue CSel = Cmp.getOperand(0);
25322 auto CSelCC = getCSETCondCode(CSel);
25323 if (CSelCC) {
25324 SDLoc DL(N);
25325 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
25326 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
25327 CSel.getOperand(3));
25328 }
25329 }
25330
25331 unsigned CmpOpc = Cmp.getOpcode();
25332 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
25333 return SDValue();
25334
25335 // Only attempt folding if there is only one use of the flag and no use of the
25336 // value.
25337 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
25338 return SDValue();
25339
25340 SDValue LHS = Cmp.getOperand(0);
25341 SDValue RHS = Cmp.getOperand(1);
25342
25343 assert(LHS.getValueType() == RHS.getValueType() &&
25344 "Expected the value type to be the same for both operands!");
25345 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
25346 return SDValue();
25347
25348 if (isNullConstant(LHS))
25349 std::swap(LHS, RHS);
25350
25351 if (!isNullConstant(RHS))
25352 return SDValue();
25353
25354 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
25355 LHS.getOpcode() == ISD::SRL)
25356 return SDValue();
25357
25358 // Fold the compare into the branch instruction.
25359 SDValue BR;
25360 if (CC == AArch64CC::EQ)
25361 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25362 else
25363 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25364
25365 // Do not add new nodes to DAG combiner worklist.
25366 DCI.CombineTo(N, BR, false);
25367
25368 return SDValue();
25369}
25370
25371 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
25372 unsigned CC = N->getConstantOperandVal(2);
25373 SDValue SUBS = N->getOperand(3);
25374 SDValue Zero, CTTZ;
25375
25376 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
25377 Zero = N->getOperand(0);
25378 CTTZ = N->getOperand(1);
25379 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
25380 Zero = N->getOperand(1);
25381 CTTZ = N->getOperand(0);
25382 } else
25383 return SDValue();
25384
25385 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
25386 (CTTZ.getOpcode() == ISD::TRUNCATE &&
25387 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
25388 return SDValue();
25389
25390 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
25391 "Illegal type in CTTZ folding");
25392
25393 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
25394 return SDValue();
25395
25396 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
25397 ? CTTZ.getOperand(0).getOperand(0)
25398 : CTTZ.getOperand(0);
25399
25400 if (X != SUBS.getOperand(0))
25401 return SDValue();
25402
25403 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
25404 ? CTTZ.getOperand(0).getValueSizeInBits()
25405 : CTTZ.getValueSizeInBits();
25406 SDValue BitWidthMinusOne =
25407 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
25408 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
25409 BitWidthMinusOne);
25410}
25411
25412// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
25413// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
25414// Where x and y are constants and x != y
25415
25416// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
25417// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
25418// Where x and y are constants and x != y
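// For example (illustrative): with x = 1 and y = 0, the inner (CSEL 1 0 cc2
// cond) is 1 exactly when cc2 holds, so comparing it against 1 for EQ is the
// same as testing cc2 on the original flags, and the outer CSEL can use cc2
// (or its inverse for the other cases) directly.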
25419 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
25420 SDValue L = Op->getOperand(0);
25421 SDValue R = Op->getOperand(1);
25422 AArch64CC::CondCode OpCC =
25423 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25424
25425 SDValue OpCmp = Op->getOperand(3);
25426 if (!isCMP(OpCmp))
25427 return SDValue();
25428
25429 SDValue CmpLHS = OpCmp.getOperand(0);
25430 SDValue CmpRHS = OpCmp.getOperand(1);
25431
25432 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
25433 std::swap(CmpLHS, CmpRHS);
25434 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
25435 return SDValue();
25436
25437 SDValue X = CmpLHS->getOperand(0);
25438 SDValue Y = CmpLHS->getOperand(1);
25439 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
25440 return SDValue();
25441 }
25442
25443 // If one of the constants is an opaque constant, the X and Y nodes can still
25444 // be distinct even though the underlying values are equal. Compare the APInts
25445 // here to make sure the code is correct.
25446 ConstantSDNode *CX = cast<ConstantSDNode>(X);
25447 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
25448 if (CX->getAPIntValue() == CY->getAPIntValue())
25449 return SDValue();
25450
25451 AArch64CC::CondCode CC =
25452 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
25453 SDValue Cond = CmpLHS->getOperand(3);
25454
25455 if (CmpRHS == Y)
25456 CC = AArch64CC::getInvertedCondCode(CC);
25457 else if (CmpRHS != X)
25458 return SDValue();
25459
25460 if (OpCC == AArch64CC::NE)
25461 CC = AArch64CC::getInvertedCondCode(CC);
25462 else if (OpCC != AArch64CC::EQ)
25463 return SDValue();
25464
25465 SDLoc DL(Op);
25466 EVT VT = Op->getValueType(0);
25467
25468 SDValue CCValue = getCondCode(DAG, CC);
25469 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
25470}
25471
25472// Reassociate the true/false expressions of a CSEL instruction to obtain a
25473// common subexpression with the comparison instruction. For example, change
25474// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
25475// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
25476// subexpression.
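// For example (illustrative): if the true operand is ((x + y) - c) and the
// comparison is (SUBS x c), rewriting the true operand as ((SUBS x c) + y)
// lets the same SUBS node produce both the flags and the reassociated sum, so
// the separate ADD of -c disappears.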
25477 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
25478 SDValue SubsNode = N->getOperand(3);
25479 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
25480 return SDValue();
25481
25482 SDValue CmpOpToMatch = SubsNode.getOperand(1);
25483 SDValue CmpOpOther = SubsNode.getOperand(0);
25484 EVT VT = N->getValueType(0);
25485
25486 unsigned ExpectedOpcode;
25487 SDValue ExpectedOp;
25488 SDValue SubsOp;
25489 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
25490 if (CmpOpConst) {
25491 ExpectedOpcode = ISD::ADD;
25492 ExpectedOp =
25493 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25494 CmpOpConst->getValueType(0));
25495 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25496 CmpOpConst->getValueType(0));
25497 } else {
25498 ExpectedOpcode = ISD::SUB;
25499 ExpectedOp = CmpOpToMatch;
25500 SubsOp = CmpOpToMatch;
25501 }
25502
25503 // Get the operand that can be reassociated with the SUBS instruction.
25504 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
25505 if (Op.getOpcode() != ExpectedOpcode)
25506 return SDValue();
25507 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
25508 !Op.getOperand(0).hasOneUse())
25509 return SDValue();
25510 SDValue X = Op.getOperand(0).getOperand(0);
25511 SDValue Y = Op.getOperand(0).getOperand(1);
25512 if (X != CmpOpOther)
25513 std::swap(X, Y);
25514 if (X != CmpOpOther)
25515 return SDValue();
25516 if (ExpectedOp != Op.getOperand(1))
25517 return SDValue();
25518 return Y;
25519 };
25520
25521 // Try the reassociation using the given constant and condition code.
25522 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
25523 SDValue SubsOp) {
25524 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
25525 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
25526 if (!TReassocOp && !FReassocOp)
25527 return SDValue();
25528
25529 SDValue NewCmp =
25530 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
25531 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
25532
25533 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
25534 if (!ReassocOp)
25535 return N->getOperand(OpNum);
25536 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
25537 NewCmp.getValue(0), ReassocOp);
25538 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
25539 return Res;
25540 };
25541
25542 SDValue TValReassoc = Reassociate(TReassocOp, 0);
25543 SDValue FValReassoc = Reassociate(FReassocOp, 1);
25544 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
25545 getCondCode(DAG, NewCC), NewCmp.getValue(1));
25546 };
25547
25548 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25549
25550 // First, try to eliminate the compare instruction by searching for a
25551 // subtraction with the same constant.
25552 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
25553 return R;
25554
25555 if (!CmpOpConst) {
25556 // Try again with the operands of the SUBS instruction and the condition
25557 // swapped. Due to canonicalization, this only helps for non-constant
25558 // operands of the SUBS instruction.
25559 std::swap(CmpOpToMatch, CmpOpOther);
25560 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
25561 return R;
25562 return SDValue();
25563 }
25564
25565 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
25566 return SDValue();
25567
25568 // Next, search for a subtraction with a slightly different constant. By
25569 // adjusting the condition code, we can still eliminate the compare
25570 // instruction. Adjusting the constant is only valid if it does not result
25571 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
25572 // Since such comparisons are trivially true/false, we should not encounter
25573 // them here but check for them nevertheless to be on the safe side.
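// For example (illustrative): "x < 10" (signed LT) holds exactly when
// "x <= 9" (LE) does, so a compare against 10 can reuse an existing SUBS of x
// and 9 by switching the condition from LT to LE, as long as subtracting 1
// from the constant does not wrap.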
25574 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
25575 AArch64CC::CondCode NewCC) {
25576 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
25577 CmpOpConst->getValueType(0));
25578 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
25579 CmpOpConst->getValueType(0));
25580 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25581 };
25582 switch (CC) {
25583 case AArch64CC::EQ:
25584 case AArch64CC::LS:
25585 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25586 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25587 case AArch64CC::NE:
25588 case AArch64CC::HI:
25589 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25590 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25591 case AArch64CC::LO:
25592 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25593 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25594 case AArch64CC::HS:
25595 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25596 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25597 case AArch64CC::LT:
25598 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25599 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25600 case AArch64CC::LE:
25601 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25602 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25603 case AArch64CC::GT:
25604 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25605 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25606 case AArch64CC::GE:
25607 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25608 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25609 default:
25610 return SDValue();
25611 }
25612}
25613
25614 static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25615 AArch64CC::CondCode OpCC =
25616 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25617
25618 if (OpCC != AArch64CC::NE)
25619 return SDValue();
25620
25621 SDValue PTest = Op->getOperand(3);
25622 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25623 return SDValue();
25624
25625 SDValue TruePred = PTest.getOperand(0);
25626 SDValue AnyPred = PTest.getOperand(1);
25627
25628 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25629 TruePred = TruePred.getOperand(0);
25630
25631 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25632 AnyPred = AnyPred.getOperand(0);
25633
25634 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
25635 return SDValue();
25636
25637 SDValue LastB = Op->getOperand(0);
25638 SDValue Default = Op->getOperand(1);
25639
25640 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
25641 return SDValue();
25642
25643 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
25644 AnyPred, Default, LastB.getOperand(1));
25645}
25646
25647// Optimize CSEL instructions
25648 static SDValue performCSELCombine(SDNode *N,
25649 TargetLowering::DAGCombinerInfo &DCI,
25650 SelectionDAG &DAG) {
25651 // CSEL x, x, cc -> x
25652 if (N->getOperand(0) == N->getOperand(1))
25653 return N->getOperand(0);
25654
25655 if (SDValue R = foldCSELOfCSEL(N, DAG))
25656 return R;
25657
25658 // Try to reassociate the true/false expressions so that we can do CSE with
25659 // a SUBS instruction used to perform the comparison.
25660 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25661 return R;
25662
25663 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25664 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
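// For example (illustrative), for a 32-bit X: CTTZ returns 32 when X is zero,
// and 32 & 31 == 0, while for non-zero X the result is already in [0, 31], so
// masking with bitwidth-1 reproduces the select without a CSEL.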
25665 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25666 return Folded;
25667
25668 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25669 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
25670 SDValue Cond = N->getOperand(3);
25671 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25672 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
25673 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25674 {Cond.getOperand(1), Cond.getOperand(0)}) &&
25675 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25676 {Cond.getOperand(0), Cond.getOperand(1)}) &&
25677 !isNullConstant(Cond.getOperand(1))) {
25678 AArch64CC::CondCode OldCond =
25679 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25680 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
25681 if (NewCond != AArch64CC::AL) {
25682 SDLoc DL(N);
25683 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25684 Cond.getOperand(1), Cond.getOperand(0));
25685 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
25686 N->getOperand(1), getCondCode(DAG, NewCond),
25687 Sub.getValue(1));
25688 }
25689 }
25690
25691 // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
25692 // use overflow flags, to avoid the comparison with zero. In case of success,
25693 // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
25694 // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
25695 // nodes with their SUBS equivalent as is already done for other flag-setting
25696 // operators, in which case doing the replacement here becomes redundant.
25697 if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
25698 isNullConstant(Cond.getOperand(1))) {
25699 SDValue Sub = Cond.getOperand(0);
25700 AArch64CC::CondCode CC =
25701 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25702 if (Sub.getOpcode() == ISD::SUB &&
25703 (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
25704 CC == AArch64CC::PL)) {
25705 SDLoc DL(N);
25706 SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25707 Sub.getOperand(0), Sub.getOperand(1));
25708 DCI.CombineTo(Sub.getNode(), Subs);
25709 DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
25710 return SDValue(N, 0);
25711 }
25712 }
25713
25714 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
25715 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
25716 return CondLast;
25717
25718 return performCONDCombine(N, DCI, DAG, 2, 3);
25719}
25720
25721 // Try to re-use an already extended operand of a vector SetCC feeding an
25722// extended select. Doing so avoids requiring another full extension of the
25723// SET_CC result when lowering the select.
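// For example (illustrative): if a v8i8 SETCC feeds only v8i16 VSELECTs and a
// sign_extend of its first operand to v8i16 already exists in the DAG, it is
// cheaper to compare the extended values (extending the constant splat
// operand for free) than to extend the v8i8 compare result.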
25724 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
25725 EVT Op0MVT = Op->getOperand(0).getValueType();
25726 if (!Op0MVT.isVector() || Op->use_empty())
25727 return SDValue();
25728
25729 // Make sure that all uses of Op are VSELECTs with result matching types where
25730 // the result type has a larger element type than the SetCC operand.
25731 SDNode *FirstUse = *Op->user_begin();
25732 if (FirstUse->getOpcode() != ISD::VSELECT)
25733 return SDValue();
25734 EVT UseMVT = FirstUse->getValueType(0);
25735 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
25736 return SDValue();
25737 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
25738 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
25739 }))
25740 return SDValue();
25741
25742 APInt V;
25743 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
25744 return SDValue();
25745
25746 SDLoc DL(Op);
25747 SDValue Op0ExtV;
25748 SDValue Op1ExtV;
25749 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
25750 // Check if the first operand of the SET_CC is already extended. If it is,
25751 // split the SET_CC and re-use the extended version of the operand.
25752 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
25753 Op->getOperand(0));
25754 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
25755 Op->getOperand(0));
25756 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25757 Op0ExtV = SDValue(Op0SExt, 0);
25758 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
25759 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25760 Op0ExtV = SDValue(Op0ZExt, 0);
25761 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
25762 } else
25763 return SDValue();
25764
25765 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
25766 Op0ExtV, Op1ExtV, Op->getOperand(2));
25767}
25768
25769static SDValue
25770 performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25771 SelectionDAG &DAG) {
25772 SDValue Vec = N->getOperand(0);
25773 if (DCI.isBeforeLegalize() &&
25774 Vec.getValueType().getVectorElementType() == MVT::i1 &&
25777 SDLoc DL(N);
25778 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
25779 DAG);
25780 }
25781
25782 return SDValue();
25783}
25784
25785 static SDValue performSETCCCombine(SDNode *N,
25786 TargetLowering::DAGCombinerInfo &DCI,
25787 SelectionDAG &DAG) {
25788 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
25789 SDValue LHS = N->getOperand(0);
25790 SDValue RHS = N->getOperand(1);
25791 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
25792 SDLoc DL(N);
25793 EVT VT = N->getValueType(0);
25794
25795 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
25796 return V;
25797
25798 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
25799 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
25800 LHS->getOpcode() == AArch64ISD::CSEL &&
25801 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
25802 LHS->hasOneUse()) {
25803 // Invert CSEL's condition.
25804 auto OldCond =
25805 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
25806 auto NewCond = getInvertedCondCode(OldCond);
25807
25808 // csel 0, 1, !cond, X
25809 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
25810 LHS.getOperand(0), LHS.getOperand(1),
25811 getCondCode(DAG, NewCond), LHS.getOperand(3));
25812 return DAG.getZExtOrTrunc(CSEL, DL, VT);
25813 }
25814
25815 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
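// For example (illustrative): "(x >> 3) != 0" is the same as
// "(x & 0xFFFFFFF8) != 0" for a 32-bit x, and the masked form can be selected
// as a TST with a logical immediate.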
25816 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
25817 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
25818 LHS->hasOneUse()) {
25819 EVT TstVT = LHS->getValueType(0);
25820 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
25821 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
25822 // This pattern is optimized better by emitComparison.
25823 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
25824 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
25825 DAG.getSignedConstant(TstImm, DL, TstVT));
25826 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
25827 }
25828 }
25829
25830 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25831 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25832 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25833 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
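// For example (illustrative): an i8 bitcast of a v8i1 mask is 0 exactly when
// no lane is set, i.e. when vecreduce_or of the mask is 0, and it is -1
// exactly when every lane is set, i.e. when the sign-extended vecreduce_and
// of the mask is -1.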
25834 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25835 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25836 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
25837 LHS->getOpcode() == ISD::BITCAST) {
25838 EVT ToVT = LHS->getValueType(0);
25839 EVT FromVT = LHS->getOperand(0).getValueType();
25840 if (FromVT.isFixedLengthVector() &&
25841 FromVT.getVectorElementType() == MVT::i1) {
25842 bool IsNull = isNullConstant(RHS);
25843 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25844 DL, MVT::i1, LHS->getOperand(0));
25845 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
25846 LHS);
25847 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25848 }
25849 }
25850
25851 // Try to perform the memcmp when the result is tested for [in]equality with 0
25852 if (SDValue V = performOrXorChainCombine(N, DAG))
25853 return V;
25854
25855 EVT CmpVT = LHS.getValueType();
25856
25857 // NOTE: This exists as a combine only because it proved too awkward to match
25858 // splat(1) across all the NEON types during isel.
25859 APInt SplatLHSVal;
25860 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
25861 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
25862 SplatLHSVal.isOne())
25863 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
25864
25865 return SDValue();
25866}
25867
25868// Replace a flag-setting operator (eg ANDS) with the generic version
25869// (eg AND) if the flag is unused.
25870 static SDValue performFlagSettingCombine(SDNode *N,
25871 TargetLowering::DAGCombinerInfo &DCI,
25872 unsigned GenericOpcode) {
25873 SDLoc DL(N);
25874 SDValue LHS = N->getOperand(0);
25875 SDValue RHS = N->getOperand(1);
25876 EVT VT = N->getValueType(0);
25877
25878 // If the flag result isn't used, convert back to a generic opcode.
25879 if (!N->hasAnyUseOfValue(1)) {
25880 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
25881 return DCI.CombineTo(N, Res, SDValue(N, 1));
25882 }
25883
25884 // Combine identical generic nodes into this node, re-using the result.
25885 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
25886 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
25887 DCI.CombineTo(Generic, SDValue(N, 0));
25888
25889 return SDValue();
25890}
25891
25892 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
25893 // setcc_merge_zero pred
25894 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
25895 // => extract_subvector (inner setcc_merge_zero)
25896 SDValue Pred = N->getOperand(0);
25897 SDValue LHS = N->getOperand(1);
25898 SDValue RHS = N->getOperand(2);
25899 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25900
25901 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
25902 LHS->getOpcode() != ISD::SIGN_EXTEND)
25903 return SDValue();
25904
25905 SDValue Extract = LHS->getOperand(0);
25906 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25907 Extract->getValueType(0) != N->getValueType(0) ||
25908 Extract->getConstantOperandVal(1) != 0)
25909 return SDValue();
25910
25911 SDValue InnerSetCC = Extract->getOperand(0);
25912 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
25913 return SDValue();
25914
25915 // By this point we've effectively got
25916 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
25917 // lanes are already zero then the trunc(sext()) sequence is redundant and we
25918 // can operate on A directly.
25919 SDValue InnerPred = InnerSetCC.getOperand(0);
25920 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
25921 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
25922 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
25923 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
25924 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
25925 return Extract;
25926
25927 return SDValue();
25928}
25929
25930static SDValue
25931 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
25932 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25933 "Unexpected opcode!");
25934
25935 SelectionDAG &DAG = DCI.DAG;
25936 SDValue Pred = N->getOperand(0);
25937 SDValue LHS = N->getOperand(1);
25938 SDValue RHS = N->getOperand(2);
25939 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25940
25941 if (SDValue V = performSetCCPunpkCombine(N, DAG))
25942 return V;
25943
25944 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
25945 LHS->getOpcode() == ISD::SIGN_EXTEND &&
25946 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
25947 // setcc_merge_zero(
25948 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
25949 // => setcc_merge_zero(pred, ...)
25950 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25951 LHS->getOperand(0)->getOperand(0) == Pred)
25952 return LHS->getOperand(0);
25953
25954 // setcc_merge_zero(
25955 // all_active, extend(nxvNi1 ...), != splat(0))
25956 // -> nxvNi1 ...
25957 if (isAllActivePredicate(DAG, Pred))
25958 return LHS->getOperand(0);
25959
25960 // setcc_merge_zero(
25961 // pred, extend(nxvNi1 ...), != splat(0))
25962 // -> nxvNi1 and(pred, ...)
25963 if (DCI.isAfterLegalizeDAG())
25964 // Do this after legalization to allow more folds on setcc_merge_zero
25965 // to be recognized.
25966 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
25967 LHS->getOperand(0), Pred);
25968 }
25969
25970 return SDValue();
25971}
25972
25973// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
25974// as well as whether the test should be inverted. This code is required to
25975// catch these cases (as opposed to standard dag combines) because
25976// AArch64ISD::TBZ is matched during legalization.
25977static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
25978 SelectionDAG &DAG) {
25979
25980 if (!Op->hasOneUse())
25981 return Op;
25982
25983 // We don't handle undef/constant-fold cases below, as they should have
25984 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
25985 // etc.)
25986
25987 // (tbz (trunc x), b) -> (tbz x, b)
25988 // This case is just here to enable more of the below cases to be caught.
25989 if (Op->getOpcode() == ISD::TRUNCATE &&
25990 Bit < Op->getValueType(0).getSizeInBits()) {
25991 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25992 }
25993
25994 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
25995 if (Op->getOpcode() == ISD::ANY_EXTEND &&
25996 Bit < Op->getOperand(0).getValueSizeInBits()) {
25997 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25998 }
25999
26000 if (Op->getNumOperands() != 2)
26001 return Op;
26002
26003 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
26004 if (!C)
26005 return Op;
26006
26007 switch (Op->getOpcode()) {
26008 default:
26009 return Op;
26010
26011 // (tbz (and x, m), b) -> (tbz x, b)
26012 case ISD::AND:
26013 if ((C->getZExtValue() >> Bit) & 1)
26014 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26015 return Op;
26016
26017 // (tbz (shl x, c), b) -> (tbz x, b-c)
26018 case ISD::SHL:
26019 if (C->getZExtValue() <= Bit &&
26020 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26021 Bit = Bit - C->getZExtValue();
26022 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26023 }
26024 return Op;
26025
26026 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
26027 case ISD::SRA:
26028 Bit = Bit + C->getZExtValue();
26029 if (Bit >= Op->getValueType(0).getSizeInBits())
26030 Bit = Op->getValueType(0).getSizeInBits() - 1;
26031 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26032
26033 // (tbz (srl x, c), b) -> (tbz x, b+c)
26034 case ISD::SRL:
26035 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26036 Bit = Bit + C->getZExtValue();
26037 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26038 }
26039 return Op;
26040
26041 // (tbz (xor x, -1), b) -> (tbnz x, b)
26042 case ISD::XOR:
26043 if ((C->getZExtValue() >> Bit) & 1)
26044 Invert = !Invert;
26045 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26046 }
26047}
26048
26049// Optimize test single bit zero/non-zero and branch.
26050 static SDValue performTBZCombine(SDNode *N,
26051 TargetLowering::DAGCombinerInfo &DCI,
26052 SelectionDAG &DAG) {
26053 unsigned Bit = N->getConstantOperandVal(2);
26054 bool Invert = false;
26055 SDValue TestSrc = N->getOperand(1);
26056 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
26057
26058 if (TestSrc == NewTestSrc)
26059 return SDValue();
26060
26061 unsigned NewOpc = N->getOpcode();
26062 if (Invert) {
26063 if (NewOpc == AArch64ISD::TBZ)
26064 NewOpc = AArch64ISD::TBNZ;
26065 else {
26066 assert(NewOpc == AArch64ISD::TBNZ);
26067 NewOpc = AArch64ISD::TBZ;
26068 }
26069 }
26070
26071 SDLoc DL(N);
26072 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
26073 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
26074}
26075
26076// Swap vselect operands where it may allow a predicated operation to achieve
26077// the `sel`.
26078//
26079// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
26080// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
26081 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
26082 auto SelectA = N->getOperand(1);
26083 auto SelectB = N->getOperand(2);
26084 auto NTy = N->getValueType(0);
26085
26086 if (!NTy.isScalableVector())
26087 return SDValue();
26088 SDValue SetCC = N->getOperand(0);
26089 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
26090 return SDValue();
26091
26092 switch (SelectB.getOpcode()) {
26093 default:
26094 return SDValue();
26095 case ISD::FMUL:
26096 case ISD::FSUB:
26097 case ISD::FADD:
26098 break;
26099 }
26100 if (SelectA != SelectB.getOperand(0))
26101 return SDValue();
26102
26103 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
26104 ISD::CondCode InverseCC =
26105 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
26106 auto InverseSetCC =
26107 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
26108 SetCC.getOperand(1), InverseCC);
26109
26110 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
26111 {InverseSetCC, SelectB, SelectA});
26112}
26113
26114// vselect (v1i1 setcc) ->
26115// vselect (v1iXX setcc) (XX is the size of the compared operand type)
26116// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
26117// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
26118// such VSELECT.
26119 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
26120 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
26121 return SwapResult;
26122
26123 SDValue N0 = N->getOperand(0);
26124 SDValue IfTrue = N->getOperand(1);
26125 SDValue IfFalse = N->getOperand(2);
26126 EVT ResVT = N->getValueType(0);
26127 EVT CCVT = N0.getValueType();
26128
26129 if (isAllActivePredicate(DAG, N0))
26130 return N->getOperand(1);
26131
26132 if (isAllInactivePredicate(N0))
26133 return N->getOperand(2);
26134
26135 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
26136 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
26137 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
26138 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
26139 // -> merge_passthru_op A, B,{Bn,} C
26140 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
26141 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
26142 IfTrue->getOperand(0) == N0) {
26143 SmallVector<SDValue, 4> Ops(IfTrue->op_values());
26144 Ops[0] = N0;
26145 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
26146
26147 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
26148 }
26149 }
26150
26151 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
26152 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
26153 // supported types.
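// For example (illustrative): for v4i32 lanes, "lhs > -1 ? 1 : -1" is
// (ASR lhs, 31) | 1, since the arithmetic shift yields 0 for non-negative
// lanes and -1 for negative lanes, and OR-ing in 1 maps those to 1 and -1.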
26154 SDValue SetCC = N->getOperand(0);
26155 if (SetCC.getOpcode() == ISD::SETCC &&
26156 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
26157 SDValue CmpLHS = SetCC.getOperand(0);
26158 EVT VT = CmpLHS.getValueType();
26159 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
26160 SDNode *SplatLHS = N->getOperand(1).getNode();
26161 SDNode *SplatRHS = N->getOperand(2).getNode();
26162 APInt SplatLHSVal;
26163 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
26164 VT.isSimple() &&
26165 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
26166 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
26167 VT.getSimpleVT().SimpleTy) &&
26168 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
26169 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
26170 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
26171 unsigned NumElts = VT.getVectorNumElements();
26172 SmallVector<SDValue, 8> Ops(
26173 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
26174 VT.getScalarType()));
26175 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
26176
26177 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
26178 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
26179 return Or;
26180 }
26181 }
26182
26183 EVT CmpVT = N0.getOperand(0).getValueType();
26184 if (N0.getOpcode() != ISD::SETCC ||
26185 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
26186 CCVT.getVectorElementType() != MVT::i1 ||
26187 CmpVT.getVectorElementType().isFloatingPoint())
26188 return SDValue();
26189
26190 // Only combine when the result type is of the same size as the compared
26191 // operands.
26192 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
26193 return SDValue();
26194
26195 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
26196 N0.getOperand(0), N0.getOperand(1),
26197 cast<CondCodeSDNode>(N0.getOperand(2))->get());
26198 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
26199 IfTrue, IfFalse);
26200}
26201
26202/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
26203/// the compare-mask instructions rather than going via NZCV, even if LHS and
26204/// RHS are really scalar. This replaces any scalar setcc in the above pattern
26205/// with a vector one followed by a DUP shuffle on the result.
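/// For example (illustrative): a select between two v4i32 values controlled by
/// a scalar i32 compare can be done as a v4i32 compare whose lane-0 result is
/// broadcast with a DUP-style shuffle and used as the select mask.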
26206 static SDValue performSelectCombine(SDNode *N,
26207 TargetLowering::DAGCombinerInfo &DCI) {
26208 SelectionDAG &DAG = DCI.DAG;
26209 SDValue N0 = N->getOperand(0);
26210 EVT ResVT = N->getValueType(0);
26211
26212 if (N0.getOpcode() != ISD::SETCC)
26213 return SDValue();
26214
26215 if (ResVT.isScalableVT())
26216 return SDValue();
26217
26218 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
26219 // scalar SetCCResultType. We also don't expect vectors, because we assume
26220 // that selects fed by vector SETCCs are canonicalized to VSELECT.
26221 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
26222 "Scalar-SETCC feeding SELECT has unexpected result type!");
26223
26224 // If NumMaskElts == 0, the comparison is larger than select result. The
26225 // largest real NEON comparison is 64-bits per lane, which means the result is
26226 // at most 32-bits and an illegal vector. Just bail out for now.
26227 EVT SrcVT = N0.getOperand(0).getValueType();
26228
26229 // Don't try to do this optimization when the setcc itself has i1 operands.
26230 // There are no legal vectors of i1, so this would be pointless. v1f16 is
26231 // ruled out to prevent the creation of setcc that need to be scalarized.
26232 if (SrcVT == MVT::i1 ||
26233 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
26234 return SDValue();
26235
26236 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
26237 if (!ResVT.isVector() || NumMaskElts == 0)
26238 return SDValue();
26239
26240 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
26241 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
26242
26243 // Also bail out if the vector CCVT isn't the same size as ResVT.
26244 // This can happen if the SETCC operand size doesn't divide the ResVT size
26245 // (e.g., f64 vs v3f32).
26246 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
26247 return SDValue();
26248
26249 // Make sure we didn't create illegal types, if we're not supposed to.
26250 assert(DCI.isBeforeLegalize() ||
26251 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
26252
26253 // First perform a vector comparison, where lane 0 is the one we're interested
26254 // in.
26255 SDLoc DL(N0);
26256 SDValue LHS =
26257 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
26258 SDValue RHS =
26259 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
26260 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
26261
26262 // Now duplicate the comparison mask we want across all other lanes.
26263 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
26264 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
26265 Mask = DAG.getNode(ISD::BITCAST, DL,
26266 ResVT.changeVectorElementTypeToInteger(), Mask);
26267
26268 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
26269}
26270
26271 static SDValue performDUPCombine(SDNode *N,
26272 TargetLowering::DAGCombinerInfo &DCI) {
26273 EVT VT = N->getValueType(0);
26274 SDLoc DL(N);
26275 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
26276 // 128-bit vector version.
26277 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
26278 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
26279 SmallVector<SDValue> Ops(N->ops());
26280 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
26281 DCI.DAG.getVTList(LVT), Ops)) {
26282 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
26283 DCI.DAG.getConstant(0, DL, MVT::i64));
26284 }
26285 }
26286
26287 if (N->getOpcode() == AArch64ISD::DUP) {
26288 // If the instruction is known to produce a scalar in SIMD registers, we can
26289 // duplicate it across the vector lanes using DUPLANE instead of moving it
26290 // to a GPR first. For example, this allows us to handle:
26291 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
26292 SDValue Op = N->getOperand(0);
26293 // FIXME: Ideally, we should be able to handle all instructions that
26294 // produce a scalar value in FPRs.
26295 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
26296 Op.getOpcode() == AArch64ISD::FCMGE ||
26297 Op.getOpcode() == AArch64ISD::FCMGT) {
26298 EVT ElemVT = VT.getVectorElementType();
26299 EVT ExpandedVT = VT;
26300 // Insert into a 128-bit vector to match DUPLANE's pattern.
26301 if (VT.getSizeInBits() != 128)
26302 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
26303 128 / ElemVT.getSizeInBits());
26304 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
26305 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
26306 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
26307 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
26308 }
26309
26310 if (DCI.isAfterLegalizeDAG()) {
26311 // If scalar dup's operand is extract_vector_elt, try to combine them into
26312 // duplane. For example,
26313 //
26314 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
26315 // t18: v4i32 = AArch64ISD::DUP t21
26316 // ==>
26317 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
26318 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
26319 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
26320 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
26321 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
26322 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
26323 EXTRACT_VEC_ELT.getOperand(1));
26324 }
26325 }
26326 }
26327
26328 return performPostLD1Combine(N, DCI, false);
26329 }
26330
26331 return SDValue();
26332}
26333
26334/// Get rid of unnecessary NVCASTs (that don't change the type).
26335 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
26336 if (N->getValueType(0) == N->getOperand(0).getValueType())
26337 return N->getOperand(0);
26338 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
26339 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
26340 N->getOperand(0).getOperand(0));
26341
26342 return SDValue();
26343}
26344
26345// If all users of the globaladdr are of the form (globaladdr + constant), find
26346// the smallest constant, fold it into the globaladdr's offset and rewrite the
26347// globaladdr as (globaladdr + constant) - constant.
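// For example (illustrative): if the only users are (G + 8) and (G + 12),
// MinOffset is 8 and G is rewritten as ((G + 8) - 8); the users then fold to
// (G + 8) + 0 and (G + 8) + 4, so the +8 can be materialized as part of the
// global's relocation rather than as a separate add.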
26348 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
26349 const AArch64Subtarget *Subtarget,
26350 const TargetMachine &TM) {
26351 auto *GN = cast<GlobalAddressSDNode>(N);
26352 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
26353 AArch64II::MO_NO_FLAG)
26354 return SDValue();
26355
26356 uint64_t MinOffset = -1ull;
26357 for (SDNode *N : GN->users()) {
26358 if (N->getOpcode() != ISD::ADD)
26359 return SDValue();
26360 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
26361 if (!C)
26362 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
26363 if (!C)
26364 return SDValue();
26365 MinOffset = std::min(MinOffset, C->getZExtValue());
26366 }
26367 uint64_t Offset = MinOffset + GN->getOffset();
26368
26369 // Require that the new offset is larger than the existing one. Otherwise, we
26370 // can end up oscillating between two possible DAGs, for example,
26371 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
26372 if (Offset <= uint64_t(GN->getOffset()))
26373 return SDValue();
26374
26375 // Check whether folding this offset is legal. It must not go out of bounds of
26376 // the referenced object to avoid violating the code model, and must be
26377 // smaller than 2^20 because this is the largest offset expressible in all
26378 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
26379 // stores an immediate signed 21 bit offset.)
26380 //
26381 // This check also prevents us from folding negative offsets, which will end
26382 // up being treated in the same way as large positive ones. They could also
26383 // cause code model violations, and aren't really common enough to matter.
26384 if (Offset >= (1 << 20))
26385 return SDValue();
26386
26387 const GlobalValue *GV = GN->getGlobal();
26388 Type *T = GV->getValueType();
26389 if (!T->isSized() ||
26390 Offset > GV->getDataLayout().getTypeAllocSize(T))
26391 return SDValue();
26392
26393 SDLoc DL(GN);
26394 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
26395 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
26396 DAG.getConstant(MinOffset, DL, MVT::i64));
26397}
26398
26399 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
26400 const AArch64Subtarget *Subtarget) {
26401 SDValue BR = N->getOperand(0);
26402 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
26403 !BR.getValueType().isScalarInteger())
26404 return SDValue();
26405
26406 SDLoc DL(N);
26407 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
26408}
26409
26410 // Turns the vector of indices into a vector of byte offsets by scaling Offset
26411// by (BitWidth / 8).
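// For example (illustrative): with 32-bit elements, an index vector
// <0, 1, 2, 3> becomes the byte-offset vector <0, 4, 8, 12> by shifting every
// lane left by log2(4) = 2.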
26412 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
26413 SDLoc DL, unsigned BitWidth) {
26414 assert(Offset.getValueType().isScalableVector() &&
26415 "This method is only for scalable vectors of offsets");
26416
26417 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
26418 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
26419
26420 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
26421}
26422
26423/// Check if the value of \p OffsetInBytes can be used as an immediate for
26424/// the gather load/prefetch and scatter store instructions with vector base and
26425/// immediate offset addressing mode:
26426///
26427/// [<Zn>.[S|D]{, #<imm>}]
26428///
26429/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
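/// For example (illustrative): for 32-bit elements, sizeof(<T>) is 4, so the
/// valid immediates are 0, 4, 8, ..., 124.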
26430inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
26431 unsigned ScalarSizeInBytes) {
26432 // The immediate is not a multiple of the scalar size.
26433 if (OffsetInBytes % ScalarSizeInBytes)
26434 return false;
26435
26436 // The immediate is out of range.
26437 if (OffsetInBytes / ScalarSizeInBytes > 31)
26438 return false;
26439
26440 return true;
26441}
26442
26443/// Check if the value of \p Offset represents a valid immediate for the SVE
26444 /// gather load/prefetch and scatter store instructions with vector base and
26445/// immediate offset addressing mode:
26446///
26447/// [<Zn>.[S|D]{, #<imm>}]
26448///
26449/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26450 inline static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
26451 unsigned ScalarSizeInBytes) {
26452 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
26453 return OffsetConst && isValidImmForSVEVecImmAddrMode(
26454 OffsetConst->getZExtValue(), ScalarSizeInBytes);
26455}
26456
26457 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
26458 unsigned Opcode,
26459 bool OnlyPackedOffsets = true) {
26460 const SDValue Src = N->getOperand(2);
26461 const EVT SrcVT = Src->getValueType(0);
26462 assert(SrcVT.isScalableVector() &&
26463 "Scatter stores are only possible for SVE vectors");
26464
26465 SDLoc DL(N);
26466 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
26467
26468 // Make sure that source data will fit into an SVE register
26469 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26470 return SDValue();
26471
26472 // For FPs, ACLE only supports _packed_ single and double precision types.
26473 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
26474 if (SrcElVT.isFloatingPoint())
26475 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
26476 ((Opcode != AArch64ISD::SST1Q_PRED &&
26477 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
26478 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
26479 return SDValue();
26480
26481 // Depending on the addressing mode, this is either a pointer or a vector of
26482 // pointers (that fits into one register)
26483 SDValue Base = N->getOperand(4);
26484 // Depending on the addressing mode, this is either a single offset or a
26485 // vector of offsets (that fits into one register)
26486 SDValue Offset = N->getOperand(5);
26487
26488 // For "scalar + vector of indices", just scale the indices. This only
26489 // applies to non-temporal scatters because there's no instruction that takes
26490 // indices.
26491 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
26492 Offset =
26493 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
26494 Opcode = AArch64ISD::SSTNT1_PRED;
26495 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
26496 Offset =
26497 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
26498 Opcode = AArch64ISD::SST1Q_PRED;
26499 }
26500
26501 // In the case of non-temporal scatter stores there's only one SVE instruction
26502 // per data-size: "scalar + vector", i.e.
26503 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26504 // Since we do have intrinsics that allow the arguments to be in a different
26505 // order, we may need to swap them to match the spec.
26506 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
26507 Offset.getValueType().isVector())
26508 std::swap(Base, Offset);
26509
26510 // SST1_IMM requires that the offset is an immediate that is:
26511 // * a multiple of #SizeInBytes,
26512 // * in the range [0, 31 x #SizeInBytes],
26513 // where #SizeInBytes is the size in bytes of the stored items. For
26514 // immediates outside that range and non-immediate scalar offsets use SST1 or
26515 // SST1_UXTW instead.
26516 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
26517 if (!isValidImmForSVEVecImmAddrMode(Offset,
26518 SrcVT.getScalarSizeInBits() / 8)) {
26519 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26520 Opcode = AArch64ISD::SST1_UXTW_PRED;
26521 else
26522 Opcode = AArch64ISD::SST1_PRED;
26523
26524 std::swap(Base, Offset);
26525 }
26526 }
26527
26528 auto &TLI = DAG.getTargetLoweringInfo();
26529 if (!TLI.isTypeLegal(Base.getValueType()))
26530 return SDValue();
26531
26532 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
26533 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
26534 // nxv2i64. Legalize accordingly.
26535 if (!OnlyPackedOffsets &&
26536 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26537 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26538
26539 if (!TLI.isTypeLegal(Offset.getValueType()))
26540 return SDValue();
26541
26542 // Source value type that is representable in hardware
26543 EVT HwSrcVt = getSVEContainerType(SrcVT);
26544
26545 // Keep the original type of the input data to store - this is needed to be
26546 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
26547 // FP values we want the integer equivalent, so just use HwSrcVt.
26548 SDValue InputVT = DAG.getValueType(SrcVT);
26549 if (SrcVT.isFloatingPoint())
26550 InputVT = DAG.getValueType(HwSrcVt);
26551
26552 SDVTList VTs = DAG.getVTList(MVT::Other);
26553 SDValue SrcNew;
26554
26555 if (Src.getValueType().isFloatingPoint())
26556 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
26557 else
26558 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
26559
26560 SDValue Ops[] = {N->getOperand(0), // Chain
26561 SrcNew,
26562 N->getOperand(3), // Pg
26563 Base,
26564 Offset,
26565 InputVT};
26566
26567 return DAG.getNode(Opcode, DL, VTs, Ops);
26568}
26569
26570 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
26571 unsigned Opcode,
26572 bool OnlyPackedOffsets = true) {
26573 const EVT RetVT = N->getValueType(0);
26574 assert(RetVT.isScalableVector() &&
26575 "Gather loads are only possible for SVE vectors");
26576
26577 SDLoc DL(N);
26578
26579 // Make sure that the loaded data will fit into an SVE register
26580 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26581 return SDValue();
26582
26583 // Depending on the addressing mode, this is either a pointer or a vector of
26584 // pointers (that fits into one register)
26585 SDValue Base = N->getOperand(3);
26586 // Depending on the addressing mode, this is either a single offset or a
26587 // vector of offsets (that fits into one register)
26588 SDValue Offset = N->getOperand(4);
26589
26590 // For "scalar + vector of indices", scale the indices to obtain unscaled
26591 // offsets. This applies to non-temporal and quadword gathers, which do not
26592 // have an addressing mode with scaled offset.
26593 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
26594 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26595 RetVT.getScalarSizeInBits());
26596 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
26597 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
26598 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26599 RetVT.getScalarSizeInBits());
26600 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
26601 }
26602
26603 // In the case of non-temporal gather loads and quadword gather loads there's
26604 // only one addressing mode: "vector + scalar", e.g.
26605 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26606 // Since we do have intrinsics that allow the arguments to be in a different
26607 // order, we may need to swap them to match the spec.
26608 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
26609 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
26610 Offset.getValueType().isVector())
26611 std::swap(Base, Offset);
26612
26613 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26614 // * a multiple of #SizeInBytes,
26615 // * in the range [0, 31 x #SizeInBytes],
26616 // where #SizeInBytes is the size in bytes of the loaded items. For
26617 // immediates outside that range and non-immediate scalar offsets use
26618 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26619 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26620 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26621 if (!isValidImmForSVEVecImmAddrMode(Offset,
26622 RetVT.getScalarSizeInBits() / 8)) {
26623 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26624 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26625 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26626 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26627 else
26628 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26629 ? AArch64ISD::GLD1_MERGE_ZERO
26630 : AArch64ISD::GLDFF1_MERGE_ZERO;
26631
26632 std::swap(Base, Offset);
26633 }
26634 }
26635
26636 auto &TLI = DAG.getTargetLoweringInfo();
26637 if (!TLI.isTypeLegal(Base.getValueType()))
26638 return SDValue();
26639
26640 // Some gather load variants allow unpacked offsets, but only as nxv2i32
26641 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
26642 // nxv2i64. Legalize accordingly.
26643 if (!OnlyPackedOffsets &&
26644 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26645 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26646
26647 // Return value type that is representable in hardware
26648 EVT HwRetVt = getSVEContainerType(RetVT);
26649
26650 // Keep the original output value type around - this is needed to be able to
26651 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26652 // values we want the integer equivalent, so just use HwRetVT.
26653 SDValue OutVT = DAG.getValueType(RetVT);
26654 if (RetVT.isFloatingPoint())
26655 OutVT = DAG.getValueType(HwRetVt);
26656
26657 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
26658 SDValue Ops[] = {N->getOperand(0), // Chain
26659 N->getOperand(2), // Pg
26660 Base, Offset, OutVT};
26661
26662 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
26663 SDValue LoadChain = SDValue(Load.getNode(), 1);
26664
26665 if (RetVT.isInteger() && (RetVT != HwRetVt))
26666 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
26667
26668 // If the original return value was FP, bitcast accordingly. Doing it here
26669 // means that we can avoid adding TableGen patterns for FPs.
26670 if (RetVT.isFloatingPoint())
26671 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
26672
26673 return DAG.getMergeValues({Load, LoadChain}, DL);
26674}
26675
26676static SDValue
26677 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26678 SelectionDAG &DAG) {
26679 SDLoc DL(N);
26680 SDValue Src = N->getOperand(0);
26681 unsigned Opc = Src->getOpcode();
26682
26683 // Sign extend of an unsigned unpack -> signed unpack
26684 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
26685
26686 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
26687 : AArch64ISD::SUNPKLO;
26688
26689 // Push the sign extend to the operand of the unpack
26690 // This is necessary where, for example, the operand of the unpack
26691 // is another unpack:
26692 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
26693 // ->
26694 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
26695 // ->
26696 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
26697 SDValue ExtOp = Src->getOperand(0);
26698 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
26699 EVT EltTy = VT.getVectorElementType();
26700 (void)EltTy;
26701
26702 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
26703 "Sign extending from an invalid type");
26704
26705 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
26706
26707 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
26708 ExtOp, DAG.getValueType(ExtVT));
26709
26710 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
26711 }
26712
26713 if (DCI.isBeforeLegalizeOps())
26714 return SDValue();
26715
26716 if (!EnableCombineMGatherIntrinsics)
26717 return SDValue();
26718
26719 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
26720 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
26721 unsigned NewOpc;
26722 unsigned MemVTOpNum = 4;
26723 switch (Opc) {
26724 case AArch64ISD::LD1_MERGE_ZERO:
26725 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
26726 MemVTOpNum = 3;
26727 break;
26728 case AArch64ISD::LDNF1_MERGE_ZERO:
26729 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
26730 MemVTOpNum = 3;
26731 break;
26732 case AArch64ISD::LDFF1_MERGE_ZERO:
26733 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
26734 MemVTOpNum = 3;
26735 break;
26736 case AArch64ISD::GLD1_MERGE_ZERO:
26737 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
26738 break;
26739 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26740 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
26741 break;
26742 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26743 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
26744 break;
26745 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26746 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
26747 break;
26748 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26749 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
26750 break;
26751 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26752 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
26753 break;
26754 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26755 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
26756 break;
26757 case AArch64ISD::GLDFF1_MERGE_ZERO:
26758 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
26759 break;
26760 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
26761 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
26762 break;
26763 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
26764 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
26765 break;
26766 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
26767 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
26768 break;
26769 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
26770 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
26771 break;
26772 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
26773 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
26774 break;
26775 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
26776 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
26777 break;
26778 case AArch64ISD::GLDNT1_MERGE_ZERO:
26779 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
26780 break;
26781 default:
26782 return SDValue();
26783 }
26784
26785 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
26786 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
26787
26788 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
26789 return SDValue();
26790
26791 EVT DstVT = N->getValueType(0);
26792 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
26793
26794 SmallVector<SDValue, 5> Ops;
26795 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
26796 Ops.push_back(Src->getOperand(I));
26797
26798 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
26799 DCI.CombineTo(N, ExtLoad);
26800 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
26801
26802 // Return N so it doesn't get rechecked
26803 return SDValue(N, 0);
26804}
26805
26806/// Legalize the gather prefetch (scalar + vector addressing mode) when the
26807/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
26808/// != nxv2i32) do not need legalization.
26809 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
26810 const unsigned OffsetPos = 4;
26811 SDValue Offset = N->getOperand(OffsetPos);
26812
26813 // Not an unpacked vector, bail out.
26814 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
26815 return SDValue();
26816
26817 // Extend the unpacked offset vector to 64-bit lanes.
26818 SDLoc DL(N);
26819 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
26820 SmallVector<SDValue, 5> Ops(N->ops());
26821 // Replace the offset operand with the 64-bit one.
26822 Ops[OffsetPos] = Offset;
26823
26824 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
26825}
26826
26827/// Combines a node carrying the intrinsic
26828/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
26829/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
26830/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
26831/// SVE gather prefetch instruction with the vector-plus-immediate addressing mode.
26832static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
26833                                               unsigned ScalarSizeInBytes) {
26834 const unsigned ImmPos = 4, OffsetPos = 3;
26835 // No need to combine the node if the immediate is valid...
26836 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
26837 return SDValue();
26838
26839 // ...otherwise swap the offset base with the offset...
26840 SmallVector<SDValue, 5> Ops(N->ops());
26841 std::swap(Ops[ImmPos], Ops[OffsetPos]);
26842 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
26843 // `aarch64_sve_prfb_gather_uxtw_index`.
26844 SDLoc DL(N);
26845 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
26846 MVT::i64);
26847
26848 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
26849}
26850
26851// Return true if the vector operation guarantees that only the first lane of
26852// its result contains data, with all bits in the other lanes set to zero.
26853static bool isLanes1toNKnownZero(SDValue Op) {
26854  switch (Op.getOpcode()) {
26855 default:
26856 return false;
26857 case AArch64ISD::ANDV_PRED:
26858 case AArch64ISD::EORV_PRED:
26859 case AArch64ISD::FADDA_PRED:
26860 case AArch64ISD::FADDV_PRED:
26861 case AArch64ISD::FMAXNMV_PRED:
26862 case AArch64ISD::FMAXV_PRED:
26863 case AArch64ISD::FMINNMV_PRED:
26864 case AArch64ISD::FMINV_PRED:
26865 case AArch64ISD::ORV_PRED:
26866 case AArch64ISD::SADDV_PRED:
26867 case AArch64ISD::SMAXV_PRED:
26868 case AArch64ISD::SMINV_PRED:
26869 case AArch64ISD::UADDV_PRED:
26870 case AArch64ISD::UMAXV_PRED:
26871 case AArch64ISD::UMINV_PRED:
26872 return true;
26873 }
26874}
26875
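// Remove a redundant explicit zeroing: inserting lane 0 of a vector whose
// other lanes are already known to be zero into lane 0 of an all-zero vector
// is a no-op, so the extracted-from vector can be returned directly.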
26876static SDValue removeRedundantInsertVectorElt(SDNode *N) {
26877  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
26878 SDValue InsertVec = N->getOperand(0);
26879 SDValue InsertElt = N->getOperand(1);
26880 SDValue InsertIdx = N->getOperand(2);
26881
26882 // We only care about inserts into the first element...
26883 if (!isNullConstant(InsertIdx))
26884 return SDValue();
26885 // ...of a zero'd vector...
26886  if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
26887    return SDValue();
26888 // ...where the inserted data was previously extracted...
26889 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26890 return SDValue();
26891
26892 SDValue ExtractVec = InsertElt.getOperand(0);
26893 SDValue ExtractIdx = InsertElt.getOperand(1);
26894
26895 // ...from the first element of a vector.
26896 if (!isNullConstant(ExtractIdx))
26897 return SDValue();
26898
26899 // If we get here we are effectively trying to zero lanes 1-N of a vector.
26900
26901 // Ensure there's no type conversion going on.
26902 if (N->getValueType(0) != ExtractVec.getValueType())
26903 return SDValue();
26904
26905 if (!isLanes1toNKnownZero(ExtractVec))
26906 return SDValue();
26907
26908 // The explicit zeroing is redundant.
26909 return ExtractVec;
26910}
26911
26912static SDValue
26913performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26914  if (SDValue Res = removeRedundantInsertVectorElt(N))
26915    return Res;
26916
26917 return performPostLD1Combine(N, DCI, true);
26918}
26919
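// Combine an FP_EXTEND of a plain vector load into an extending load when SVE
// is used for fixed-length vectors; the original load's users are then fed by
// an FP_ROUND of the extending load. Conceptually (types are illustrative):
//   (v8f32 fp_extend (v8f16 load x)) --> (v8f32 extload x)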
26920static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
26921                                      TargetLowering::DAGCombinerInfo &DCI,
26922                                      const AArch64Subtarget *Subtarget) {
26923 SDValue N0 = N->getOperand(0);
26924 EVT VT = N->getValueType(0);
26925
26926 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
26927 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
26928 return SDValue();
26929
26930 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
26931 EVT EltVT = VT.getVectorElementType();
26932 return EltVT == MVT::f32 || EltVT == MVT::f64;
26933 };
26934
26935 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
26936 // We purposefully don't care about legality of the nodes here as we know
26937 // they can be split down into something legal.
26938 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
26939 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
26940 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
26941 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
26942 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
26943 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
26944 LN0->getChain(), LN0->getBasePtr(),
26945 N0.getValueType(), LN0->getMemOperand());
26946 DCI.CombineTo(N, ExtLoad);
26947 DCI.CombineTo(
26948 N0.getNode(),
26949 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
26950 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
26951 ExtLoad.getValue(1));
26952 return SDValue(N, 0); // Return N so it doesn't get rechecked!
26953 }
26954
26955 return SDValue();
26956}
26957
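// Expand an AArch64ISD::BSP (bitwise select) on scalable vectors into
// (or (and Mask, In1), (and (not Mask), In2)) when neither SVE2 nor SME
// provides a native BSL instruction.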
26958static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
26959                                      const AArch64Subtarget *Subtarget) {
26960 EVT VT = N->getValueType(0);
26961
26962  // Don't expand for fixed-length (NEON) types, or when SVE2 or SME provides BSL.
26963 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
26964 return SDValue();
26965
26966 SDLoc DL(N);
26967
26968 SDValue Mask = N->getOperand(0);
26969 SDValue In1 = N->getOperand(1);
26970 SDValue In2 = N->getOperand(2);
26971
26972 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
26973 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
26974 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
26975 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
26976}
26977
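// Sink a bitcast through DUPLANE128:
//   duplane128(insert_subvector(undef, bitcast(X), 0), 0)
// is rewritten so the DUPLANE128 operates on X's own element type and a
// single bitcast is applied to the result instead.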
26978static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
26979  EVT VT = N->getValueType(0);
26980
26981 SDValue Insert = N->getOperand(0);
26982 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
26983 return SDValue();
26984
26985 if (!Insert.getOperand(0).isUndef())
26986 return SDValue();
26987
26988 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
26989 uint64_t IdxDupLane = N->getConstantOperandVal(1);
26990 if (IdxInsert != 0 || IdxDupLane != 0)
26991 return SDValue();
26992
26993 SDValue Bitcast = Insert.getOperand(1);
26994 if (Bitcast.getOpcode() != ISD::BITCAST)
26995 return SDValue();
26996
26997 SDValue Subvec = Bitcast.getOperand(0);
26998 EVT SubvecVT = Subvec.getValueType();
26999 if (!SubvecVT.is128BitVector())
27000 return SDValue();
27001 EVT NewSubvecVT =
27003
27004 SDLoc DL(N);
27005 SDValue NewInsert =
27006 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
27007 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
27008 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
27009 NewInsert, N->getOperand(1));
27010 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
27011}
27012
27013// Try to combine mull with uzp1.
27014static SDValue tryCombineMULLWithUZP1(SDNode *N,
27015                                      TargetLowering::DAGCombinerInfo &DCI,
27016                                      SelectionDAG &DAG) {
27017 if (DCI.isBeforeLegalizeOps())
27018 return SDValue();
27019
27020 SDValue LHS = N->getOperand(0);
27021 SDValue RHS = N->getOperand(1);
27022
27023 SDValue ExtractHigh;
27024 SDValue ExtractLow;
27025 SDValue TruncHigh;
27026 SDValue TruncLow;
27027 SDLoc DL(N);
27028
27029 // Check the operands are trunc and extract_high.
27030  if (isEssentiallyExtractHighSubvector(LHS) &&
27031      RHS.getOpcode() == ISD::TRUNCATE) {
27032 TruncHigh = RHS;
27033 if (LHS.getOpcode() == ISD::BITCAST)
27034 ExtractHigh = LHS.getOperand(0);
27035 else
27036 ExtractHigh = LHS;
27037  } else if (isEssentiallyExtractHighSubvector(RHS) &&
27038             LHS.getOpcode() == ISD::TRUNCATE) {
27039 TruncHigh = LHS;
27040 if (RHS.getOpcode() == ISD::BITCAST)
27041 ExtractHigh = RHS.getOperand(0);
27042 else
27043 ExtractHigh = RHS;
27044 } else
27045 return SDValue();
27046
27047  // If the truncate's operand is a splat (a DUP or a splat BUILD_VECTOR), do
27048  // not combine the op with uzp1; doing so regresses
27049  // test/CodeGen/AArch64/aarch64-smull.ll.
27050 SDValue TruncHighOp = TruncHigh.getOperand(0);
27051 EVT TruncHighOpVT = TruncHighOp.getValueType();
27052 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
27053 DAG.isSplatValue(TruncHighOp, false))
27054 return SDValue();
27055
27056  // Check whether there is another extract_high with the same source vector.
27057  // For example:
27058 //
27059 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
27060 // t12: v4i16 = truncate t11
27061 // t31: v4i32 = AArch64ISD::SMULL t18, t12
27062 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
27063 // t16: v4i16 = truncate t15
27064 // t30: v4i32 = AArch64ISD::SMULL t23, t1
27065 //
27066  // This dagcombine assumes the two extract_high nodes use the same source
27067  // vector in order to detect the pair of mulls. If they have different source
27068  // vectors, this combine does not apply.
27069 // TODO: Should also try to look through a bitcast.
27070 bool HasFoundMULLow = true;
27071 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
27072 if (ExtractHighSrcVec->use_size() != 2)
27073 HasFoundMULLow = false;
27074
27075 // Find ExtractLow.
27076 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
27077 if (User == ExtractHigh.getNode())
27078 continue;
27079
27080 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27081        !isNullConstant(User->getOperand(1))) {
27082      HasFoundMULLow = false;
27083 break;
27084 }
27085
27086 ExtractLow.setNode(User);
27087 }
27088
27089 if (!ExtractLow || !ExtractLow->hasOneUse())
27090 HasFoundMULLow = false;
27091
27092 // Check ExtractLow's user.
27093 if (HasFoundMULLow) {
27094 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
27095 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
27096 HasFoundMULLow = false;
27097 } else {
27098 if (ExtractLowUser->getOperand(0) == ExtractLow) {
27099 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
27100 TruncLow = ExtractLowUser->getOperand(1);
27101 else
27102 HasFoundMULLow = false;
27103 } else {
27104 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
27105 TruncLow = ExtractLowUser->getOperand(0);
27106 else
27107 HasFoundMULLow = false;
27108 }
27109 }
27110 }
27111
27112  // If the truncate's operand is a splat (a DUP or a splat BUILD_VECTOR), do
27113  // not combine the op with uzp1; doing so regresses
27114  // test/CodeGen/AArch64/aarch64-smull.ll.
27115 EVT TruncHighVT = TruncHigh.getValueType();
27116 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27117 SDValue TruncLowOp =
27118 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
27119 EVT TruncLowOpVT = TruncLowOp.getValueType();
27120 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
27121 DAG.isSplatValue(TruncLowOp, false)))
27122 return SDValue();
27123
27124 // Create uzp1, extract_high and extract_low.
27125 if (TruncHighOpVT != UZP1VT)
27126 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
27127 if (TruncLowOpVT != UZP1VT)
27128 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
27129
27130 SDValue UZP1 =
27131 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
27132 SDValue HighIdxCst =
27133 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
27134 SDValue NewTruncHigh =
27135 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
27136 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
27137
27138 if (HasFoundMULLow) {
27139 EVT TruncLowVT = TruncLow.getValueType();
27140 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
27141 UZP1, ExtractLow.getOperand(1));
27142 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
27143 }
27144
27145 return SDValue(N, 0);
27146}
27147
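// Combine for SMULL/UMULL/PMULL: first try rewriting a long multiply whose
// operand is a duplicated value, then try pairing it with UZP1 via
// tryCombineMULLWithUZP1 above.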
27148static SDValue performMULLCombine(SDNode *N,
27149                                  TargetLowering::DAGCombinerInfo &DCI,
27150                                  SelectionDAG &DAG) {
27151 if (SDValue Val =
27152          tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
27153    return Val;
27154
27155 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
27156 return Val;
27157
27158 return SDValue();
27159}
27160
27161static SDValue
27162performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
27163                             SelectionDAG &DAG) {
27164  // Perform the transform below:
27165 //
27166 // t34: v4i32 = AArch64ISD::UADDLV t2
27167 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
27168 // t7: i64 = zero_extend t35
27169 // t20: v1i64 = scalar_to_vector t7
27170 // ==>
27171 // t34: v4i32 = AArch64ISD::UADDLV t2
27172 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
27173 // t40: v1i64 = AArch64ISD::NVCAST t39
27174 if (DCI.isBeforeLegalizeOps())
27175 return SDValue();
27176
27177 EVT VT = N->getValueType(0);
27178 if (VT != MVT::v1i64)
27179 return SDValue();
27180
27181 SDValue ZEXT = N->getOperand(0);
27182 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
27183 return SDValue();
27184
27185 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
27186 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
27187 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
27188 return SDValue();
27189
27190 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
27191 return SDValue();
27192
27193 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
27194 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
27195 UADDLV.getValueType() != MVT::v4i32 ||
27196 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
27197 return SDValue();
27198
27199  // Generate the new sequence with AArch64ISD::NVCAST.
27200 SDLoc DL(N);
27201 SDValue EXTRACT_SUBVEC =
27202 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
27203 DAG.getConstant(0, DL, MVT::i64));
27204 SDValue NVCAST =
27205 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
27206
27207 return NVCAST;
27208}
27209
27210/// If the operand is a bitwise AND with a constant RHS, the shift has a
27211/// constant RHS, and the AND has no other uses, we can pull the AND out of the shift, i.e.
27212///
27213/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
27214///
27215/// We prefer this canonical form to match existing isel patterns.
27216static SDValue performSHLCombine(SDNode *N,
27217                                 TargetLowering::DAGCombinerInfo &DCI,
27218                                 SelectionDAG &DAG) {
27219 if (DCI.isBeforeLegalizeOps())
27220 return SDValue();
27221
27222 SDValue Op0 = N->getOperand(0);
27223 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
27224 return SDValue();
27225
27226 SDValue C1 = Op0->getOperand(1);
27227 SDValue C2 = N->getOperand(1);
27228 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
27229 return SDValue();
27230
27231  // The shift might be folded into a user as a shifted operand; do not lower it in that case.
27232 if (N->hasOneUse()) {
27233 unsigned UseOpc = N->user_begin()->getOpcode();
27234 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
27235 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
27236 return SDValue();
27237 }
27238
27239 SDLoc DL(N);
27240 EVT VT = N->getValueType(0);
27241
27242 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
27243 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
27244  // causing an infinite loop. The result may also be worse.
27245 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
27246 if (!isa<ConstantSDNode>(NewRHS))
27247 return SDValue();
27248
27249 SDValue X = Op0->getOperand(0);
27250 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
27251 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
27252}
27253
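// Lower the aarch64.rndr / aarch64.rndrrs intrinsics: read the RNDR/RNDRRS
// system register with MRS and materialise the NZCV-based success flag as an
// i1 via CSINC.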
27254static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
27255  unsigned IntrinsicID = N->getConstantOperandVal(1);
27256 auto Register =
27257 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
27258 : AArch64SysReg::RNDRRS);
27259 SDLoc DL(N);
27260 SDValue A = DAG.getNode(
27261 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
27262 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
27263 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
27264 DAG.getConstant(0, DL, MVT::i32),
27265 DAG.getConstant(0, DL, MVT::i32),
27266 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
27267 return DAG.getMergeValues(
27268 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
27269}
27270
27271SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
27272                                                 DAGCombinerInfo &DCI) const {
27273 SelectionDAG &DAG = DCI.DAG;
27274 switch (N->getOpcode()) {
27275 default:
27276 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
27277 break;
27278 case ISD::VECREDUCE_AND:
27279 case ISD::VECREDUCE_OR:
27280 case ISD::VECREDUCE_XOR:
27281 return performVecReduceBitwiseCombine(N, DCI, DAG);
27282 case ISD::ADD:
27283 case ISD::SUB:
27284 return performAddSubCombine(N, DCI);
27285 case ISD::BUILD_VECTOR:
27286 return performBuildVectorCombine(N, DCI, DAG);
27287 case ISD::SMIN:
27288 return performSMINCombine(N, DAG);
27289 case ISD::TRUNCATE:
27290 return performTruncateCombine(N, DAG, DCI);
27291 case AArch64ISD::ANDS:
27292 return performFlagSettingCombine(N, DCI, ISD::AND);
27293 case AArch64ISD::ADC:
27294 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27295 return R;
27296 return foldADCToCINC(N, DAG);
27297 case AArch64ISD::SBC:
27298 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
27299 case AArch64ISD::ADCS:
27300 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27301 return R;
27302 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
27303 case AArch64ISD::SBCS:
27304 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
27305 return R;
27306 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
27307 case AArch64ISD::BICi: {
27308    APInt DemandedBits =
27309        APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
27310 APInt DemandedElts =
27311 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
27312
27313    if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
27314            SDValue(N, 0), DemandedBits, DemandedElts, DCI))
27315 return SDValue();
27316
27317 break;
27318 }
27319 case ISD::XOR:
27320 return performXorCombine(N, DAG, DCI, Subtarget);
27321 case ISD::MUL:
27322 return performMulCombine(N, DAG, DCI, Subtarget);
27323 case ISD::SINT_TO_FP:
27324 case ISD::UINT_TO_FP:
27325 return performIntToFpCombine(N, DAG, DCI, Subtarget);
27326 case ISD::FP_TO_SINT:
27327 case ISD::FP_TO_UINT:
27328  case ISD::FP_TO_SINT_SAT:
27329  case ISD::FP_TO_UINT_SAT:
27330    return performFpToIntCombine(N, DAG, DCI, Subtarget);
27331 case ISD::OR:
27332 return performORCombine(N, DCI, Subtarget, *this);
27333 case ISD::AND:
27334 return performANDCombine(N, DCI);
27335 case ISD::FADD:
27336 return performFADDCombine(N, DCI);
27337  case ISD::INTRINSIC_WO_CHAIN:
27338    return performIntrinsicCombine(N, DCI, Subtarget);
27339 case ISD::ANY_EXTEND:
27340 case ISD::ZERO_EXTEND:
27341 case ISD::SIGN_EXTEND:
27342 return performExtendCombine(N, DCI, DAG);
27343  case ISD::SIGN_EXTEND_INREG:
27344    return performSignExtendInRegCombine(N, DCI, DAG);
27345  case ISD::CONCAT_VECTORS:
27346    return performConcatVectorsCombine(N, DCI, DAG);
27347  case ISD::EXTRACT_SUBVECTOR:
27348    return performExtractSubvectorCombine(N, DCI, DAG);
27349  case ISD::INSERT_SUBVECTOR:
27350    return performInsertSubvectorCombine(N, DCI, DAG);
27351 case ISD::SELECT:
27352 return performSelectCombine(N, DCI);
27353 case ISD::VSELECT:
27354 return performVSelectCombine(N, DCI.DAG);
27355 case ISD::SETCC:
27356 return performSETCCCombine(N, DCI, DAG);
27357 case ISD::LOAD:
27358 return performLOADCombine(N, DCI, DAG, Subtarget);
27359 case ISD::STORE:
27360 return performSTORECombine(N, DCI, DAG, Subtarget);
27361 case ISD::MSTORE:
27362 return performMSTORECombine(N, DCI, DAG, Subtarget);
27363 case ISD::MGATHER:
27364 case ISD::MSCATTER:
27366 return performMaskedGatherScatterCombine(N, DCI, DAG);
27367 case ISD::FP_EXTEND:
27368 return performFPExtendCombine(N, DAG, DCI, Subtarget);
27369 case AArch64ISD::BRCOND:
27370 return performBRCONDCombine(N, DCI, DAG);
27371 case AArch64ISD::TBNZ:
27372 case AArch64ISD::TBZ:
27373 return performTBZCombine(N, DCI, DAG);
27374 case AArch64ISD::CSEL:
27375 return performCSELCombine(N, DCI, DAG);
27376 case AArch64ISD::DUP:
27377 case AArch64ISD::DUPLANE8:
27378 case AArch64ISD::DUPLANE16:
27379 case AArch64ISD::DUPLANE32:
27380 case AArch64ISD::DUPLANE64:
27381 return performDUPCombine(N, DCI);
27382 case AArch64ISD::DUPLANE128:
27383 return performDupLane128Combine(N, DAG);
27384 case AArch64ISD::NVCAST:
27385 return performNVCASTCombine(N, DAG);
27386 case AArch64ISD::SPLICE:
27387 return performSpliceCombine(N, DAG);
27388 case AArch64ISD::UUNPKLO:
27389 case AArch64ISD::UUNPKHI:
27390 return performUnpackCombine(N, DAG, Subtarget);
27391 case AArch64ISD::UZP1:
27392 case AArch64ISD::UZP2:
27393 return performUzpCombine(N, DAG, Subtarget);
27394 case AArch64ISD::SETCC_MERGE_ZERO:
27395 return performSetccMergeZeroCombine(N, DCI);
27396 case AArch64ISD::REINTERPRET_CAST:
27397    return performReinterpretCastCombine(N, DAG);
27398  case AArch64ISD::GLD1_MERGE_ZERO:
27399 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27400 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27401 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27402 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27403 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27404 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27405 case AArch64ISD::GLD1S_MERGE_ZERO:
27406 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
27407 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
27408 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
27409 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
27410 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
27411 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
27412 return performGLD1Combine(N, DAG);
27413 case AArch64ISD::VASHR:
27414 case AArch64ISD::VLSHR:
27415 return performVectorShiftCombine(N, *this, DCI);
27416 case AArch64ISD::SUNPKLO:
27417 return performSunpkloCombine(N, DAG);
27418 case AArch64ISD::BSP:
27419 return performBSPExpandForSVE(N, DAG, Subtarget);
27420  case ISD::INSERT_VECTOR_ELT:
27421    return performInsertVectorEltCombine(N, DCI);
27422  case ISD::EXTRACT_VECTOR_ELT:
27423    return performExtractVectorEltCombine(N, DCI, Subtarget);
27424 case ISD::VECREDUCE_ADD:
27425 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
27426  case ISD::GET_ACTIVE_LANE_MASK:
27427    return performActiveLaneMaskCombine(N, DCI, Subtarget);
27428 case AArch64ISD::UADDV:
27429 return performUADDVCombine(N, DAG);
27430 case AArch64ISD::SMULL:
27431 case AArch64ISD::UMULL:
27432 case AArch64ISD::PMULL:
27433 return performMULLCombine(N, DCI, DAG);
27434  case ISD::INTRINSIC_VOID:
27435  case ISD::INTRINSIC_W_CHAIN:
27436    switch (N->getConstantOperandVal(1)) {
27437 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
27438 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
27439 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
27440 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
27441 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
27442 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
27443 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
27444 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
27445 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
27446 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
27447 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
27448 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
27449 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
27450 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
27451 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
27452 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
27453      return legalizeSVEGatherPrefetchOffsVec(N, DAG);
27454    case Intrinsic::aarch64_neon_ld2:
27455 case Intrinsic::aarch64_neon_ld3:
27456 case Intrinsic::aarch64_neon_ld4:
27457 case Intrinsic::aarch64_neon_ld1x2:
27458 case Intrinsic::aarch64_neon_ld1x3:
27459 case Intrinsic::aarch64_neon_ld1x4:
27460 case Intrinsic::aarch64_neon_ld2lane:
27461 case Intrinsic::aarch64_neon_ld3lane:
27462 case Intrinsic::aarch64_neon_ld4lane:
27463 case Intrinsic::aarch64_neon_ld2r:
27464 case Intrinsic::aarch64_neon_ld3r:
27465 case Intrinsic::aarch64_neon_ld4r:
27466 case Intrinsic::aarch64_neon_st2:
27467 case Intrinsic::aarch64_neon_st3:
27468 case Intrinsic::aarch64_neon_st4:
27469 case Intrinsic::aarch64_neon_st1x2:
27470 case Intrinsic::aarch64_neon_st1x3:
27471 case Intrinsic::aarch64_neon_st1x4:
27472 case Intrinsic::aarch64_neon_st2lane:
27473 case Intrinsic::aarch64_neon_st3lane:
27474 case Intrinsic::aarch64_neon_st4lane:
27475 return performNEONPostLDSTCombine(N, DCI, DAG);
27476 case Intrinsic::aarch64_sve_ldnt1:
27477 return performLDNT1Combine(N, DAG);
27478 case Intrinsic::aarch64_sve_ld1rq:
27479 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
27480 case Intrinsic::aarch64_sve_ld1ro:
27481 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
27482 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
27483 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27484 case Intrinsic::aarch64_sve_ldnt1_gather:
27485 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27486 case Intrinsic::aarch64_sve_ldnt1_gather_index:
27487 return performGatherLoadCombine(N, DAG,
27488 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
27489 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
27490 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27491 case Intrinsic::aarch64_sve_ld1:
27492 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
27493 case Intrinsic::aarch64_sve_ldnf1:
27494 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
27495 case Intrinsic::aarch64_sve_ldff1:
27496 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
27497 case Intrinsic::aarch64_sve_st1:
27498 return performST1Combine(N, DAG);
27499 case Intrinsic::aarch64_sve_stnt1:
27500 return performSTNT1Combine(N, DAG);
27501 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
27502 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27503 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
27504 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27505 case Intrinsic::aarch64_sve_stnt1_scatter:
27506 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27507 case Intrinsic::aarch64_sve_stnt1_scatter_index:
27508 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
27509 case Intrinsic::aarch64_sve_ld1_gather:
27510 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
27511 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
27512 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
27513 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
27514 case Intrinsic::aarch64_sve_ld1q_gather_index:
27515 return performGatherLoadCombine(N, DAG,
27516 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
27517 case Intrinsic::aarch64_sve_ld1_gather_index:
27518 return performGatherLoadCombine(N, DAG,
27519 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
27520 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
27521 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
27522 /*OnlyPackedOffsets=*/false);
27523 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
27524 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
27525 /*OnlyPackedOffsets=*/false);
27526 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
27527 return performGatherLoadCombine(N, DAG,
27528 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
27529 /*OnlyPackedOffsets=*/false);
27530 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
27531 return performGatherLoadCombine(N, DAG,
27532 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
27533 /*OnlyPackedOffsets=*/false);
27534 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
27535 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
27536 case Intrinsic::aarch64_sve_ldff1_gather:
27537 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
27538 case Intrinsic::aarch64_sve_ldff1_gather_index:
27539 return performGatherLoadCombine(N, DAG,
27540 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
27541 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
27542 return performGatherLoadCombine(N, DAG,
27543 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
27544 /*OnlyPackedOffsets=*/false);
27545 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
27546 return performGatherLoadCombine(N, DAG,
27547 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
27548 /*OnlyPackedOffsets=*/false);
27549 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
27550 return performGatherLoadCombine(N, DAG,
27551 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
27552 /*OnlyPackedOffsets=*/false);
27553 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
27554 return performGatherLoadCombine(N, DAG,
27555 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
27556 /*OnlyPackedOffsets=*/false);
27557 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
27558 return performGatherLoadCombine(N, DAG,
27559 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
27560 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
27561 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
27562 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
27563 case Intrinsic::aarch64_sve_st1q_scatter_index:
27564 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
27565 case Intrinsic::aarch64_sve_st1_scatter:
27566 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
27567 case Intrinsic::aarch64_sve_st1_scatter_index:
27568 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
27569 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
27570 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
27571 /*OnlyPackedOffsets=*/false);
27572 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
27573 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
27574 /*OnlyPackedOffsets=*/false);
27575 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
27576 return performScatterStoreCombine(N, DAG,
27577 AArch64ISD::SST1_SXTW_SCALED_PRED,
27578 /*OnlyPackedOffsets=*/false);
27579 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
27580 return performScatterStoreCombine(N, DAG,
27581 AArch64ISD::SST1_UXTW_SCALED_PRED,
27582 /*OnlyPackedOffsets=*/false);
27583 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
27584 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
27585 case Intrinsic::aarch64_rndr:
27586 case Intrinsic::aarch64_rndrrs:
27587 return performRNDRCombine(N, DAG);
27588 case Intrinsic::aarch64_sme_ldr_zt:
27589 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
27590 DAG.getVTList(MVT::Other), N->getOperand(0),
27591 N->getOperand(2), N->getOperand(3));
27592 case Intrinsic::aarch64_sme_str_zt:
27593 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
27594 DAG.getVTList(MVT::Other), N->getOperand(0),
27595 N->getOperand(2), N->getOperand(3));
27596 default:
27597 break;
27598 }
27599 break;
27600 case ISD::GlobalAddress:
27601 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
27602 case ISD::CTLZ:
27603 return performCTLZCombine(N, DAG, Subtarget);
27604  case ISD::SCALAR_TO_VECTOR:
27605    return performScalarToVectorCombine(N, DCI, DAG);
27606 case ISD::SHL:
27607 return performSHLCombine(N, DCI, DAG);
27608 }
27609 return SDValue();
27610}
27611
27612// Check that the return value is used only as a return value, as otherwise
27613// we can't perform a tail-call. In particular, we need to check for
27614// target ISD nodes that are returns and any other "odd" constructs
27615// that the generic analysis code won't necessarily catch.
27616bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
27617 SDValue &Chain) const {
27618 if (N->getNumValues() != 1)
27619 return false;
27620 if (!N->hasNUsesOfValue(1, 0))
27621 return false;
27622
27623 SDValue TCChain = Chain;
27624 SDNode *Copy = *N->user_begin();
27625 if (Copy->getOpcode() == ISD::CopyToReg) {
27626 // If the copy has a glue operand, we conservatively assume it isn't safe to
27627 // perform a tail call.
27628 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
27629 MVT::Glue)
27630 return false;
27631 TCChain = Copy->getOperand(0);
27632 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
27633 return false;
27634
27635 bool HasRet = false;
27636 for (SDNode *Node : Copy->users()) {
27637 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
27638 return false;
27639 HasRet = true;
27640 }
27641
27642 if (!HasRet)
27643 return false;
27644
27645 Chain = TCChain;
27646 return true;
27647}
27648
27649// Return whether an instruction can potentially be optimized to a tail
27650// call. This will cause the optimizers to attempt to move, or duplicate,
27651// return instructions to help enable tail call optimizations for this
27652// instruction.
27653bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
27654 return CI->isTailCall();
27655}
27656
27657bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
27658 Register Offset, bool IsPre,
27659 MachineRegisterInfo &MRI) const {
27660 auto CstOffset = getIConstantVRegVal(Offset, MRI);
27661 if (!CstOffset || CstOffset->isZero())
27662 return false;
27663
27664 // All of the indexed addressing mode instructions take a signed 9 bit
27665 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
27666 // encodes the sign/indexing direction.
27667 return isInt<9>(CstOffset->getSExtValue());
27668}
27669
27670bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
27671 SDValue &Base,
27672 SDValue &Offset,
27673 SelectionDAG &DAG) const {
27674 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
27675 return false;
27676
27677 // Non-null if there is exactly one user of the loaded value (ignoring chain).
27678 SDNode *ValOnlyUser = nullptr;
27679 for (SDUse &U : N->uses()) {
27680 if (U.getResNo() == 1)
27681 continue; // Ignore chain.
27682 if (ValOnlyUser == nullptr)
27683 ValOnlyUser = U.getUser();
27684 else {
27685 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
27686 break;
27687 }
27688 }
27689
27690 auto IsUndefOrZero = [](SDValue V) {
27691 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
27692 };
27693
27694 // If the only user of the value is a scalable vector splat, it is
27695 // preferable to do a replicating load (ld1r*).
27696 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
27697 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
27698 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
27699 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
27700 return false;
27701
27702 Base = Op->getOperand(0);
27703 // All of the indexed addressing mode instructions take a signed
27704 // 9 bit immediate offset.
27705 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
27706 int64_t RHSC = RHS->getSExtValue();
27707 if (Op->getOpcode() == ISD::SUB)
27708 RHSC = -(uint64_t)RHSC;
27709 if (!isInt<9>(RHSC))
27710 return false;
27711    // When big-endian VLD1/VST1 are used for the vector load and store, they
27712    // only allow an offset that's equal to the store size.
27713 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
27714 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
27715 (uint64_t)RHSC != MemType.getStoreSize())
27716 return false;
27717 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
27718 // when dealing with subtraction.
27719 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
27720 return true;
27721 }
27722 return false;
27723}
27724
27725bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
27726 SDValue &Offset,
27727                                                      ISD::MemIndexedMode &AM,
27728                                                      SelectionDAG &DAG) const {
27729 EVT VT;
27730 SDValue Ptr;
27731 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
27732 VT = LD->getMemoryVT();
27733 Ptr = LD->getBasePtr();
27734 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
27735 VT = ST->getMemoryVT();
27736 Ptr = ST->getBasePtr();
27737 } else
27738 return false;
27739
27740 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
27741 return false;
27742 AM = ISD::PRE_INC;
27743 return true;
27744}
27745
27746bool AArch64TargetLowering::getPostIndexedAddressParts(
27747    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
27748    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
27749 EVT VT;
27750 SDValue Ptr;
27751 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
27752 VT = LD->getMemoryVT();
27753 Ptr = LD->getBasePtr();
27754 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
27755 VT = ST->getMemoryVT();
27756 Ptr = ST->getBasePtr();
27757 } else
27758 return false;
27759
27760 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
27761 return false;
27762 // Post-indexing updates the base, so it's not a valid transform
27763 // if that's not the same as the load's pointer.
27764 if (Ptr != Base)
27765 return false;
27766 AM = ISD::POST_INC;
27767 return true;
27768}
27769
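// Replace a bitcast from a vector of i1 to a scalar integer: build the
// bitmask with vectorToScalarBitmask and zero-extend or truncate it to the
// result type.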
27770static void replaceBoolVectorBitcast(SDNode *N,
27771                                     SmallVectorImpl<SDValue> &Results,
27772                                     SelectionDAG &DAG) {
27773 SDLoc DL(N);
27774 SDValue Op = N->getOperand(0);
27775 EVT VT = N->getValueType(0);
27776 [[maybe_unused]] EVT SrcVT = Op.getValueType();
27777 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27778 "Must be bool vector.");
27779
27780 // Special handling for Clang's __builtin_convertvector. For vectors with <8
27781 // elements, it adds a vector concatenation with undef(s). If we encounter
27782 // this here, we can skip the concat.
27783 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
27784 bool AllUndef = true;
27785 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
27786 AllUndef &= Op.getOperand(I).isUndef();
27787
27788 if (AllUndef)
27789 Op = Op.getOperand(0);
27790 }
27791
27792 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
27793 if (VectorBits)
27794 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
27795}
27796
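// Helper for ReplaceBITCASTResults: bitcast a small, illegal vector type by
// moving the scalar input into lane 0 of a wider vector (SCALAR_TO_VECTOR),
// bitcasting to CastVT, and extracting the original-width subvector.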
27797static void CustomNonLegalBITCASTResults(SDNode *N,
27798                                         SmallVectorImpl<SDValue> &Results,
27799                                         SelectionDAG &DAG, EVT ExtendVT,
27800 EVT CastVT) {
27801 SDLoc DL(N);
27802 SDValue Op = N->getOperand(0);
27803 EVT VT = N->getValueType(0);
27804
27805 // Use SCALAR_TO_VECTOR for lane zero
27806 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
27807 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
27808 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
27809 Results.push_back(
27810 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
27811}
27812
27813void AArch64TargetLowering::ReplaceBITCASTResults(
27814    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27815  SDLoc DL(N);
27816 SDValue Op = N->getOperand(0);
27817 EVT VT = N->getValueType(0);
27818 EVT SrcVT = Op.getValueType();
27819
27820 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
27821 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
27822 return;
27823 }
27824
27825 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
27826 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
27827 return;
27828 }
27829
27830 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
27831 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
27832 return;
27833 }
27834
27835 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
27836 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
27837 "Expected fp->int bitcast!");
27838
27839 // Bitcasting between unpacked vector types of different element counts is
27840 // not a NOP because the live elements are laid out differently.
27841 // 01234567
27842 // e.g. nxv2i32 = XX??XX??
27843 // nxv4f16 = X?X?X?X?
27844 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
27845 return;
27846
27847 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
27848 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
27849 return;
27850 }
27851
27852 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27853 !VT.isVector())
27854 return replaceBoolVectorBitcast(N, Results, DAG);
27855
27856 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
27857 return;
27858
27859 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
27860 DAG.getUNDEF(MVT::i32), Op);
27861 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
27862 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
27863}
27864
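// Turn a 256-bit (f)add of X and a shuffle of X that swaps adjacent element
// pairs into a split plus AArch64ISD::ADDP on the two halves, followed by a
// shuffle that puts the pairwise sums back into place.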
27865static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
27866                               SelectionDAG &DAG,
27867 const AArch64Subtarget *Subtarget) {
27868 EVT VT = N->getValueType(0);
27869 if (!VT.is256BitVector() ||
27870      (VT.getScalarType().isFloatingPoint() &&
27871       !N->getFlags().hasAllowReassociation()) ||
27872 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
27873 VT.getScalarType() == MVT::bf16)
27874 return;
27875
27876 SDValue X = N->getOperand(0);
27877 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
27878 if (!Shuf) {
27879 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
27880 X = N->getOperand(1);
27881 if (!Shuf)
27882 return;
27883 }
27884
27885 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
27886 return;
27887
27888 // Check the mask is 1,0,3,2,5,4,...
27889 ArrayRef<int> Mask = Shuf->getMask();
27890 for (int I = 0, E = Mask.size(); I < E; I++)
27891 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
27892 return;
27893
27894 SDLoc DL(N);
27895 auto LoHi = DAG.SplitVector(X, DL);
27896 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
27897 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
27898 LoHi.first, LoHi.second);
27899
27900 // Shuffle the elements back into order.
27901 SmallVector<int> NMask;
27902 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
27903 NMask.push_back(I);
27904 NMask.push_back(I);
27905 }
27906 Results.push_back(
27907 DAG.getVectorShuffle(VT, DL,
27908 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
27909 DAG.getUNDEF(LoHi.first.getValueType())),
27910 DAG.getUNDEF(VT), NMask));
27911}
27912
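// Split a reduction: combine the two halves of the operand with InterOp and
// finish with the across-vector reduction AcrossOp.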
27913static void ReplaceReductionResults(SDNode *N,
27914                                    SmallVectorImpl<SDValue> &Results,
27915                                    SelectionDAG &DAG, unsigned InterOp,
27916 unsigned AcrossOp) {
27917 EVT LoVT, HiVT;
27918 SDValue Lo, Hi;
27919 SDLoc DL(N);
27920 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
27921 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
27922 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
27923 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
27924 Results.push_back(SplitVal);
27925}
27926
27927void AArch64TargetLowering::ReplaceExtractSubVectorResults(
27928    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27929  SDValue In = N->getOperand(0);
27930 EVT InVT = In.getValueType();
27931
27932 // Common code will handle these just fine.
27933 if (!InVT.isScalableVector() || !InVT.isInteger())
27934 return;
27935
27936 SDLoc DL(N);
27937 EVT VT = N->getValueType(0);
27938
27939 // The following checks bail if this is not a halving operation.
27940
27942
27943 if (InVT.getVectorElementCount() != (ResEC * 2))
27944 return;
27945
27946 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
27947 if (!CIndex)
27948 return;
27949
27950 unsigned Index = CIndex->getZExtValue();
27951 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
27952 return;
27953
27954 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
27955 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27956
27957 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
27958 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
27959}
27960
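// Legalize an nxv32i1 get.active.lane.mask by issuing the paired
// sve.whilelo.x2 intrinsic and concatenating the two half-width predicates.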
27961void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
27962    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27963  assert((Subtarget->hasSVE2p1() ||
27964 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
27965 "Custom lower of get.active.lane.mask missing required feature.");
27966
27967 assert(N->getValueType(0) == MVT::nxv32i1 &&
27968 "Unexpected result type for get.active.lane.mask");
27969
27970 SDLoc DL(N);
27971 SDValue Idx = N->getOperand(0);
27972 SDValue TC = N->getOperand(1);
27973
27974 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
27975 "Unexpected operand type for get.active.lane.mask");
27976
27977 if (Idx.getValueType() != MVT::i64) {
27978 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
27979 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
27980 }
27981
27982 SDValue ID =
27983 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
27984 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
27985 auto WideMask =
27986 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
27987
27988 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
27989 {WideMask.getValue(0), WideMask.getValue(1)}));
27990}
27991
27992// Create an even/odd pair of X registers holding integer value V.
27993static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
27994  SDLoc DL(V.getNode());
27995 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
27996 if (DAG.getDataLayout().isBigEndian())
27997 std::swap (VLo, VHi);
27998 SDValue RegClass =
27999 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
28000 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
28001 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
28002 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
28003 return SDValue(
28004 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
28005}
28006
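// Expand a 128-bit ATOMIC_CMP_SWAP: use CASP when LSE (or outlined atomics)
// is available, otherwise fall back to the CMP_SWAP_128* pseudo instructions
// chosen from the merged atomic ordering.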
28007static void ReplaceCMP_SWAP_128Results(SDNode *N,
28008                                       SmallVectorImpl<SDValue> &Results,
28009                                       SelectionDAG &DAG,
28010 const AArch64Subtarget *Subtarget) {
28011 assert(N->getValueType(0) == MVT::i128 &&
28012 "AtomicCmpSwap on types less than 128 should be legal");
28013
28014 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28015 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
28016 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
28017 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
28018 SDValue Ops[] = {
28019 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
28020 createGPRPairNode(DAG, N->getOperand(3)), // Store value
28021 N->getOperand(1), // Ptr
28022 N->getOperand(0), // Chain in
28023 };
28024
28025 unsigned Opcode;
28026 switch (MemOp->getMergedOrdering()) {
28027    case AtomicOrdering::Monotonic:
28028      Opcode = AArch64::CASPX;
28029 break;
28030    case AtomicOrdering::Acquire:
28031      Opcode = AArch64::CASPAX;
28032 break;
28033    case AtomicOrdering::Release:
28034      Opcode = AArch64::CASPLX;
28035 break;
28036    case AtomicOrdering::AcquireRelease:
28037    case AtomicOrdering::SequentiallyConsistent:
28038      Opcode = AArch64::CASPALX;
28039 break;
28040 default:
28041 llvm_unreachable("Unexpected ordering!");
28042 }
28043
28044 MachineSDNode *CmpSwap = DAG.getMachineNode(
28045 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
28046 DAG.setNodeMemRefs(CmpSwap, {MemOp});
28047
28048 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
28049 if (DAG.getDataLayout().isBigEndian())
28050 std::swap(SubReg1, SubReg2);
28051 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
28052 SDValue(CmpSwap, 0));
28053 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
28054 SDValue(CmpSwap, 0));
28055 Results.push_back(
28056 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28057 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
28058 return;
28059 }
28060
28061 unsigned Opcode;
28062 switch (MemOp->getMergedOrdering()) {
28063  case AtomicOrdering::Monotonic:
28064    Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
28065 break;
28066  case AtomicOrdering::Acquire:
28067    Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
28068 break;
28069  case AtomicOrdering::Release:
28070    Opcode = AArch64::CMP_SWAP_128_RELEASE;
28071 break;
28072  case AtomicOrdering::AcquireRelease:
28073  case AtomicOrdering::SequentiallyConsistent:
28074    Opcode = AArch64::CMP_SWAP_128;
28075 break;
28076 default:
28077 llvm_unreachable("Unexpected ordering!");
28078 }
28079
28080 SDLoc DL(N);
28081 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
28082 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
28083 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
28084 New.first, New.second, N->getOperand(0)};
28085 SDNode *CmpSwap = DAG.getMachineNode(
28086 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
28087 Ops);
28088 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
28089
28090 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28091 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
28092 Results.push_back(SDValue(CmpSwap, 3));
28093}
28094
28095static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
28096 AtomicOrdering Ordering) {
28097 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
28098 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
28099 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
28100 // ATOMIC_LOAD_CLR at any point.
28101 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
28102 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
28103 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
28104 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
28105
28106 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28107 // The operand will need to be XORed in a separate step.
28108 switch (Ordering) {
28109    case AtomicOrdering::Monotonic:
28110      return AArch64::LDCLRP;
28111 break;
28112    case AtomicOrdering::Acquire:
28113      return AArch64::LDCLRPA;
28114 break;
28115    case AtomicOrdering::Release:
28116      return AArch64::LDCLRPL;
28117 break;
28118    case AtomicOrdering::AcquireRelease:
28119    case AtomicOrdering::SequentiallyConsistent:
28120      return AArch64::LDCLRPAL;
28121 break;
28122 default:
28123 llvm_unreachable("Unexpected ordering!");
28124 }
28125 }
28126
28127 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
28128 switch (Ordering) {
28129    case AtomicOrdering::Monotonic:
28130      return AArch64::LDSETP;
28131 break;
28132    case AtomicOrdering::Acquire:
28133      return AArch64::LDSETPA;
28134 break;
28135    case AtomicOrdering::Release:
28136      return AArch64::LDSETPL;
28137 break;
28138    case AtomicOrdering::AcquireRelease:
28139    case AtomicOrdering::SequentiallyConsistent:
28140      return AArch64::LDSETPAL;
28141 break;
28142 default:
28143 llvm_unreachable("Unexpected ordering!");
28144 }
28145 }
28146
28147 if (ISDOpcode == ISD::ATOMIC_SWAP) {
28148 switch (Ordering) {
28149    case AtomicOrdering::Monotonic:
28150      return AArch64::SWPP;
28151 break;
28152    case AtomicOrdering::Acquire:
28153      return AArch64::SWPPA;
28154 break;
28155    case AtomicOrdering::Release:
28156      return AArch64::SWPPL;
28157 break;
28158    case AtomicOrdering::AcquireRelease:
28159    case AtomicOrdering::SequentiallyConsistent:
28160      return AArch64::SWPPAL;
28161 break;
28162 default:
28163 llvm_unreachable("Unexpected ordering!");
28164 }
28165 }
28166
28167 llvm_unreachable("Unexpected ISDOpcode!");
28168}
28169
28170static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
28171                                          SmallVectorImpl<SDValue> &Results,
28172                                          SelectionDAG &DAG,
28173 const AArch64Subtarget *Subtarget) {
28174  // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
28175 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
28176 // rather than the CASP instructions, because CASP has register classes for
28177 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
28178 // to present them as single operands. LSE128 instructions use the GPR64
28179 // register class (because the pair does not have to be sequential), like
28180 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
28181
28182 assert(N->getValueType(0) == MVT::i128 &&
28183 "AtomicLoadXXX on types less than 128 should be legal");
28184
28185 if (!Subtarget->hasLSE128())
28186 return;
28187
28188 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28189 const SDValue &Chain = N->getOperand(0);
28190 const SDValue &Ptr = N->getOperand(1);
28191 const SDValue &Val128 = N->getOperand(2);
28192 std::pair<SDValue, SDValue> Val2x64 =
28193 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
28194
28195 const unsigned ISDOpcode = N->getOpcode();
28196 const unsigned MachineOpcode =
28197 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
28198
28199 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28200 SDLoc DL(Val128);
28201 Val2x64.first =
28202 DAG.getNode(ISD::XOR, DL, MVT::i64,
28203 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
28204 Val2x64.second =
28205 DAG.getNode(ISD::XOR, DL, MVT::i64,
28206 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
28207 }
28208
28209 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
28210 if (DAG.getDataLayout().isBigEndian())
28211 std::swap(Ops[0], Ops[1]);
28212
28213 MachineSDNode *AtomicInst =
28214 DAG.getMachineNode(MachineOpcode, SDLoc(N),
28215 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
28216
28217 DAG.setNodeMemRefs(AtomicInst, {MemOp});
28218
28219 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
28220 if (DAG.getDataLayout().isBigEndian())
28221 std::swap(Lo, Hi);
28222
28223 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28224 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
28225}
28226
28227void AArch64TargetLowering::ReplaceNodeResults(
28228    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28229  switch (N->getOpcode()) {
28230 default:
28231 llvm_unreachable("Don't know how to custom expand this");
28232 case ISD::BITCAST:
28233 ReplaceBITCASTResults(N, Results, DAG);
28234 return;
28235 case ISD::VECREDUCE_ADD:
28236  case ISD::VECREDUCE_SMAX:
28237  case ISD::VECREDUCE_SMIN:
28238  case ISD::VECREDUCE_UMAX:
28239  case ISD::VECREDUCE_UMIN:
28240    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
28241 return;
28242  case ISD::VECTOR_COMPRESS:
28243    if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
28244 Results.push_back(Res);
28245 return;
28246 case ISD::ADD:
28247 case ISD::FADD:
28248 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
28249 return;
28250
28251 case ISD::CTPOP:
28252 case ISD::PARITY:
28253 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
28254 Results.push_back(Result);
28255 return;
28256 case AArch64ISD::SADDV:
28257 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
28258 return;
28259 case AArch64ISD::UADDV:
28260 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
28261 return;
28262 case AArch64ISD::SMINV:
28263 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
28264 return;
28265 case AArch64ISD::UMINV:
28266 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
28267 return;
28268 case AArch64ISD::SMAXV:
28269 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
28270 return;
28271 case AArch64ISD::UMAXV:
28272 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
28273 return;
28274 case ISD::MULHS:
28276 Results.push_back(
28277 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
28278 return;
28279 case ISD::MULHU:
28281 Results.push_back(
28282 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
28283 return;
28284 case ISD::FP_TO_UINT:
28285 case ISD::FP_TO_SINT:
28286  case ISD::STRICT_FP_TO_SINT:
28287  case ISD::STRICT_FP_TO_UINT:
28288    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
28289 // Let normal code take care of it by not adding anything to Results.
28290 return;
28291  case ISD::ATOMIC_CMP_SWAP:
28292    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
28293 return;
28294  case ISD::ATOMIC_LOAD_CLR:
28295    assert(N->getValueType(0) != MVT::i128 &&
28296 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
28297 break;
28298  case ISD::ATOMIC_LOAD_AND:
28299  case ISD::ATOMIC_LOAD_OR:
28300  case ISD::ATOMIC_SWAP: {
28301 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
28302 "Expected 128-bit atomicrmw.");
28303    // These need custom type legalisation, so we create the machine instructions directly.
28304 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
28305 return;
28306 }
28307 case ISD::ADDRSPACECAST: {
28308 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
28309 Results.push_back(V);
28310 return;
28311 }
28312 case ISD::ATOMIC_LOAD:
28313 case ISD::LOAD: {
28314 MemSDNode *LoadNode = cast<MemSDNode>(N);
28315 EVT MemVT = LoadNode->getMemoryVT();
28316    // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
28317 // targets.
28318 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
28319 MemVT.getSizeInBits() == 256u &&
28320 (MemVT.getScalarSizeInBits() == 8u ||
28321 MemVT.getScalarSizeInBits() == 16u ||
28322 MemVT.getScalarSizeInBits() == 32u ||
28323 MemVT.getScalarSizeInBits() == 64u)) {
28324
28325 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
28326      SDValue Result = DAG.getMemIntrinsicNode(
28327          AArch64ISD::LDNP, SDLoc(N),
28328 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
28329 {LoadNode->getChain(), LoadNode->getBasePtr()},
28330 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28331
28332 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
28333 DAG.getBitcast(HalfVT, Result.getValue(0)),
28334 DAG.getBitcast(HalfVT, Result.getValue(1)));
28335 Results.append({Pair, Result.getValue(2) /* Chain */});
28336 return;
28337 }
28338
28339 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
28340 LoadNode->getMemoryVT() != MVT::i128) {
28341      // Loads that are neither volatile nor atomic are optimized later in
28342      // AArch64's load/store optimizer.
28343 return;
28344 }
28345
28346 if (SDValue(N, 0).getValueType() == MVT::i128) {
28347 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
28348 bool isLoadAcquire =
28350 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
28351
28352 if (isLoadAcquire)
28353 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
28354
28355      SDValue Result = DAG.getMemIntrinsicNode(
28356          Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28357 {LoadNode->getChain(), LoadNode->getBasePtr()},
28358 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28359
28360 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
28361
28362 SDValue Pair =
28363 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
28364 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
28365 Results.append({Pair, Result.getValue(2) /* Chain */});
28366 }
28367 return;
28368 }
28369  case ISD::EXTRACT_SUBVECTOR:
28370    ReplaceExtractSubVectorResults(N, Results, DAG);
28371 return;
28372  case ISD::INSERT_SUBVECTOR:
28373  case ISD::CONCAT_VECTORS:
28374    // Custom lowering has been requested for INSERT_SUBVECTOR and
28375 // CONCAT_VECTORS -- but delegate to common code for result type
28376 // legalisation
28377 return;
28378  case ISD::GET_ACTIVE_LANE_MASK:
28379    ReplaceGetActiveLaneMaskResults(N, Results, DAG);
28380 return;
28381  case ISD::INTRINSIC_WO_CHAIN: {
28382    EVT VT = N->getValueType(0);
28383
28384 Intrinsic::ID IntID =
28385 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
28386 switch (IntID) {
28387 default:
28388 return;
28389 case Intrinsic::aarch64_sve_clasta_n: {
28390 assert((VT == MVT::i8 || VT == MVT::i16) &&
28391 "custom lowering for unexpected type");
28392 SDLoc DL(N);
28393 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28394 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
28395 N->getOperand(1), Op2, N->getOperand(3));
28396 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28397 return;
28398 }
28399 case Intrinsic::aarch64_sve_clastb_n: {
28400 assert((VT == MVT::i8 || VT == MVT::i16) &&
28401 "custom lowering for unexpected type");
28402 SDLoc DL(N);
28403 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28404 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
28405 N->getOperand(1), Op2, N->getOperand(3));
28406 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28407 return;
28408 }
28409 case Intrinsic::aarch64_sve_lasta: {
28410 assert((VT == MVT::i8 || VT == MVT::i16) &&
28411 "custom lowering for unexpected type");
28412 SDLoc DL(N);
28413 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
28414 N->getOperand(1), N->getOperand(2));
28415 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28416 return;
28417 }
28418 case Intrinsic::aarch64_sve_lastb: {
28419 assert((VT == MVT::i8 || VT == MVT::i16) &&
28420 "custom lowering for unexpected type");
28421 SDLoc DL(N);
28422 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
28423 N->getOperand(1), N->getOperand(2));
28424 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28425 return;
28426 }
28427 case Intrinsic::aarch64_sme_in_streaming_mode: {
28428 SDLoc DL(N);
28429 SDValue Chain = DAG.getEntryNode();
28430
28431 SDValue RuntimePStateSM =
28432 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
28433 Results.push_back(
28434 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
28435 return;
28436 }
28437 case Intrinsic::experimental_vector_match: {
28438 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
28439 return;
28440
28441 // NOTE: Only trivial type promotion is supported.
28442 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
28443 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
28444 return;
28445
28446 SDLoc DL(N);
28447 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
28448 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28449 return;
28450 }
28451 }
28452 }
28453 case ISD::READ_REGISTER: {
28454 SDLoc DL(N);
28455 assert(N->getValueType(0) == MVT::i128 &&
28456 "READ_REGISTER custom lowering is only for 128-bit sysregs");
28457 SDValue Chain = N->getOperand(0);
28458 SDValue SysRegName = N->getOperand(1);
28459
28460 SDValue Result = DAG.getNode(
28461 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28462 Chain, SysRegName);
28463
28464 // System register reads are not affected by endianness: Result.getValue(0)
28465 // always contains the lower half of the 128-bit System Register value.
28466 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28467 Result.getValue(0), Result.getValue(1));
28468 Results.push_back(Pair);
28469 Results.push_back(Result.getValue(2)); // Chain
28470 return;
28471 }
28472 }
28473}
28474
28476 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
28478 return true;
28479}
28480
28481unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
28482 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
28483 // reciprocal if there are three or more FDIVs.
28484 return 3;
28485}
28486
28489 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
28490 // v4i16, v2i32 instead of promoting them.
28491 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
28492 VT == MVT::v1f32)
28493 return TypeWidenVector;
28494
28496}
28497
28498// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
28499// provided the address is 16-byte aligned.
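// Illustrative example (assumption, not from the original source): with
// FEAT_LSE2 a load such as
//   %v = load atomic i128, ptr %p monotonic, align 16
// can be selected to a plain "ldp x0, x1, [x2]" instead of an LL/SC loop or
// a library call.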
28501 if (!Subtarget->hasLSE2())
28502 return false;
28503
28504 if (auto LI = dyn_cast<LoadInst>(I))
28505 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28506 LI->getAlign() >= Align(16);
28507
28508 if (auto SI = dyn_cast<StoreInst>(I))
28509 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28510 SI->getAlign() >= Align(16);
28511
28512 return false;
28513}
28514
28516 if (!Subtarget->hasLSE128())
28517 return false;
28518
28519 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
28520 // will clobber the two registers.
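// Illustrative sketch (assumption): a 128-bit "store atomic ... release,
// align 16" can then be implemented with a single swppl, whose result
// overwrites the register pair that held the store data.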
28521 if (const auto *SI = dyn_cast<StoreInst>(I))
28522 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28523 SI->getAlign() >= Align(16) &&
28524 (SI->getOrdering() == AtomicOrdering::Release ||
28525 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
28526
28527 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
28528 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28529 RMW->getAlign() >= Align(16) &&
28530 (RMW->getOperation() == AtomicRMWInst::Xchg ||
28531 RMW->getOperation() == AtomicRMWInst::And ||
28532 RMW->getOperation() == AtomicRMWInst::Or);
28533
28534 return false;
28535}
28536
28538 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
28539 return false;
28540
28541 if (auto LI = dyn_cast<LoadInst>(I))
28542 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28543 LI->getAlign() >= Align(16) &&
28544 LI->getOrdering() == AtomicOrdering::Acquire;
28545
28546 if (auto SI = dyn_cast<StoreInst>(I))
28547 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28548 SI->getAlign() >= Align(16) &&
28549 SI->getOrdering() == AtomicOrdering::Release;
28550
28551 return false;
28552}
28553
28555 const Instruction *I) const {
28557 return false;
28559 return false;
28561 return true;
28562 return false;
28563}
28564
28566 const Instruction *I) const {
28567 // Store-Release instructions only provide seq_cst guarantees when paired with
28568 // Load-Acquire instructions. MSVC CRT does not use these instructions to
28569 // implement seq_cst loads and stores, so we need additional explicit fences
28570 // after memory writes.
28571 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28572 return false;
28573
28574 switch (I->getOpcode()) {
28575 default:
28576 return false;
28577 case Instruction::AtomicCmpXchg:
28578 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
28580 case Instruction::AtomicRMW:
28581 return cast<AtomicRMWInst>(I)->getOrdering() ==
28583 case Instruction::Store:
28584 return cast<StoreInst>(I)->getOrdering() ==
28586 }
28587}
28588
28589 // Loads and stores smaller than 128 bits are already atomic; ones above that
28590// are doomed anyway, so defer to the default libcall and blame the OS when
28591// things go wrong.
28594 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
28595 if (Size != 128)
28597 if (isOpSuitableForRCPC3(SI))
28599 if (isOpSuitableForLSE128(SI))
28601 if (isOpSuitableForLDPSTP(SI))
28604}
28605
28606 // Loads and stores smaller than 128 bits are already atomic; ones above that
28607// are doomed anyway, so defer to the default libcall and blame the OS when
28608// things go wrong.
28611 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
28612
28613 if (Size != 128)
28615 if (isOpSuitableForRCPC3(LI))
28617 // No LSE128 loads
28618 if (isOpSuitableForLDPSTP(LI))
28620
28621 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28622 // implement atomicrmw without spilling. If the target address is also on the
28623 // stack and close enough to the spill slot, this can lead to a situation
28624 // where the monitor always gets cleared and the atomic operation can never
28625 // succeed. So at -O0 lower this operation to a CAS loop.
28626 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28628
28629 // Using CAS for an atomic load has a better chance of succeeding under high
28630 // contention. So use it if available.
28631 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
28633}
28634
28635// Return true if the atomic operation expansion will lower to use a library
28636// call, and is thus ineligible to use an LLSC expansion.
28637static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
28638 const AtomicRMWInst *RMW) {
28639 if (!RMW->isFloatingPointOperation())
28640 return false;
28641 switch (RMW->getType()->getScalarType()->getTypeID()) {
28642 case Type::FloatTyID:
28643 case Type::DoubleTyID:
28644 case Type::HalfTyID:
28645 case Type::BFloatTyID:
28646 // Will use soft float
28647 return !Subtarget.hasFPARMv8();
28648 default:
28649 // fp128 will emit library calls.
28650 return true;
28651 }
28652
28653 llvm_unreachable("covered type switch");
28654}
28655
28656// The "default" for integer RMW operations is to expand to an LL/SC loop.
28657// However, with the LSE instructions (or outline-atomics mode, which provides
28658// library routines in place of the LSE-instructions), we can directly emit many
28659// operations instead.
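// Illustrative example (assumption): with +lse,
//   %old = atomicrmw add ptr %p, i64 %v seq_cst
// can be selected directly to "ldaddal x1, x0, [x2]" instead of an
// ldaxr/stlxr retry loop.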
28662 Type *Ty = AI->getType();
28663 unsigned Size = Ty->getPrimitiveSizeInBits();
28664 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
28665
28666 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
28670 if (CanUseLSE128)
28672
28673 // If LSFE is available, use atomic FP instructions in preference to expansion
28674 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
28680
28681 // Nand is not supported in LSE.
28682 // Leave 128 bits to LLSC or CmpXChg.
28683 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
28684 !AI->isFloatingPointOperation()) {
28685 if (Subtarget->hasLSE())
28687 if (Subtarget->outlineAtomics()) {
28688 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
28689 // Don't outline them unless
28690 // (1) high level <atomic> support approved:
28691 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
28692 // (2) low level libgcc and compiler-rt support implemented by:
28693 // min/max outline atomics helpers
28694 if (AI->getOperation() != AtomicRMWInst::Min &&
28699 }
28700 }
28701 }
28702
28703 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28704 // implement atomicrmw without spilling. If the target address is also on the
28705 // stack and close enough to the spill slot, this can lead to a situation
28706 // where the monitor always gets cleared and the atomic operation can never
28707 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
28708 // we have a single CAS instruction that can replace the loop.
28710 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
28712
28714}
28715
28718 AtomicCmpXchgInst *AI) const {
28719 // If subtarget has LSE, leave cmpxchg intact for codegen.
28720 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
28722 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28723 // implement cmpxchg without spilling. If the address being exchanged is also
28724 // on the stack and close enough to the spill slot, this can lead to a
28725 // situation where the monitor always gets cleared and the atomic operation
28726 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
28727 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28729
28730 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
28731 // it.
28733 if (Size > 64)
28735
28737}
28738
28740 Type *ValueTy, Value *Addr,
28741 AtomicOrdering Ord) const {
28742 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28743 bool IsAcquire = isAcquireOrStronger(Ord);
28744
28745 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
28746 // intrinsics must return {i64, i64} and we have to recombine them into a
28747 // single i128 here.
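// Illustrative example (assumption): for an acquire ordering this emits
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
// and the extractvalue/zext/shl/or sequence below rebuilds the i128 value.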
28748 if (ValueTy->getPrimitiveSizeInBits() == 128) {
28750 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
28751
28752 Value *LoHi =
28753 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
28754
28755 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
28756 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
28757
28758 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
28759 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
28760 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
28761
28762 Value *Or = Builder.CreateOr(
28763 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
28764 return Builder.CreateBitCast(Or, ValueTy);
28765 }
28766
28767 Type *Tys[] = { Addr->getType() };
28769 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
28770
28771 const DataLayout &DL = M->getDataLayout();
28772 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
28773 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
28774 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
28775 Attribute::ElementType, IntEltTy));
28776 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
28777
28778 return Builder.CreateBitCast(Trunc, ValueTy);
28779}
28780
28782 IRBuilderBase &Builder) const {
28783 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
28784}
28785
28787 Value *Val, Value *Addr,
28788 AtomicOrdering Ord) const {
28789 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28790 bool IsRelease = isReleaseOrStronger(Ord);
28791
28792 // Since the intrinsics must have legal type, the i128 intrinsics take two
28793 // parameters: "i64, i64". We must marshal Val into the appropriate form
28794 // before the call.
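// Illustrative example (assumption): a release i128 store-exclusive becomes
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// where %lo and %hi are the truncated halves computed below.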
28795 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
28797 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
28799 Type *Int64Ty = Type::getInt64Ty(M->getContext());
28800 Type *Int128Ty = Type::getInt128Ty(M->getContext());
28801
28802 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
28803
28804 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
28805 Value *Hi =
28806 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
28807 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
28808 }
28809
28811 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
28812 Type *Tys[] = { Addr->getType() };
28814
28815 const DataLayout &DL = M->getDataLayout();
28816 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
28817 Val = Builder.CreateBitCast(Val, IntValTy);
28818
28819 CallInst *CI = Builder.CreateCall(
28820 Stxr, {Builder.CreateZExtOrBitCast(
28821 Val, Stxr->getFunctionType()->getParamType(0)),
28822 Addr});
28823 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
28824 Attribute::ElementType, Val->getType()));
28825 return CI;
28826}
28827
28829 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
28830 const DataLayout &DL) const {
28831 if (!Ty->isArrayTy()) {
28832 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
28833 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
28834 }
28835
28836 // All non-aggregate members of the type must have the same type.
28837 SmallVector<EVT> ValueVTs;
28838 ComputeValueVTs(*this, DL, Ty, ValueVTs);
28839 return all_equal(ValueVTs);
28840}
28841
28842bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
28843 EVT) const {
28844 return false;
28845}
28846
28847static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
28848 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
28849 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
28850 M, Intrinsic::thread_pointer, IRB.getPtrTy());
28851 return IRB.CreatePointerCast(
28852 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
28853 Offset),
28854 IRB.getPtrTy(0));
28855}
28856
28858 // Android provides a fixed TLS slot for the stack cookie. See the definition
28859 // of TLS_SLOT_STACK_GUARD in
28860 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
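// Illustrative example (assumption): the resulting cookie load is essentially
//   mrs  x0, TPIDR_EL0
//   ldr  x0, [x0, #0x28]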
28861 if (Subtarget->isTargetAndroid())
28862 return UseTlsOffset(IRB, 0x28);
28863
28864 // Fuchsia is similar.
28865 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
28866 if (Subtarget->isTargetFuchsia())
28867 return UseTlsOffset(IRB, -0x10);
28868
28870}
28871
28873 // MSVC CRT provides functionality for stack protection.
28874 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
28875 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
28876
28877 RTLIB::LibcallImpl SecurityCookieVar =
28878 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
28879 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
28880 SecurityCookieVar != RTLIB::Unsupported) {
28881 // MSVC CRT has a global variable holding security cookie.
28882 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
28883 PointerType::getUnqual(M.getContext()));
28884
28885 // MSVC CRT has a function to validate security cookie.
28886 FunctionCallee SecurityCheckCookie =
28887 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
28888 Type::getVoidTy(M.getContext()),
28889 PointerType::getUnqual(M.getContext()));
28890 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
28891 F->setCallingConv(CallingConv::Win64);
28892 F->addParamAttr(0, Attribute::AttrKind::InReg);
28893 }
28894 return;
28895 }
28897}
28898
28900 // MSVC CRT has a function to validate security cookie.
28901 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
28902 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
28903 if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
28904 return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
28906}
28907
28908Value *
28910 // Android provides a fixed TLS slot for the SafeStack pointer. See the
28911 // definition of TLS_SLOT_SAFESTACK in
28912 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
28913 if (Subtarget->isTargetAndroid())
28914 return UseTlsOffset(IRB, 0x48);
28915
28916 // Fuchsia is similar.
28917 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
28918 if (Subtarget->isTargetFuchsia())
28919 return UseTlsOffset(IRB, -0x8);
28920
28922}
28923
28924/// If a physical register, this returns the register that receives the
28925/// exception address on entry to an EH pad.
28927 const Constant *PersonalityFn) const {
28928 // FIXME: This is a guess. Has this been defined yet?
28929 return AArch64::X0;
28930}
28931
28932/// If a physical register, this returns the register that receives the
28933/// exception typeid on entry to a landing pad.
28935 const Constant *PersonalityFn) const {
28936 // FIXME: This is a guess. Has this been defined yet?
28937 return AArch64::X1;
28938}
28939
28941 const Instruction &AndI) const {
28942 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
28943 // this likely allows the and/cmp/br to be folded into a single tbz instruction. It
28944 // may be beneficial to sink in other cases, but we would have to check that
28945 // the cmp would not get folded into the br to form a cbz for these to be
28946 // beneficial.
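// Illustrative example (assumption): for
//   %m = and i64 %x, 16
//   %c = icmp ne i64 %m, 0
//   br i1 %c, label %then, label %else
// sinking the 'and' next to its use lets ISel emit a single tbz/tbnz on bit 4.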
28947 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
28948 if (!Mask)
28949 return false;
28950 return Mask->getValue().isPowerOf2();
28951}
28952
28956 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
28957 SelectionDAG &DAG) const {
28958 // Does baseline recommend not to perform the fold by default?
28960 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
28961 return false;
28962 // Else, if this is a vector shift, prefer 'shl'.
28963 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
28964}
28965
28968 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
28970 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
28973 ExpansionFactor);
28974}
28975
28977 // Update IsSplitCSR in AArch64FunctionInfo.
28978 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
28979 AFI->setIsSplitCSR(true);
28980}
28981
28983 MachineBasicBlock *Entry,
28984 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
28985 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
28986 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
28987 if (!IStart)
28988 return;
28989
28990 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28991 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28992 MachineBasicBlock::iterator MBBI = Entry->begin();
28993 for (const MCPhysReg *I = IStart; *I; ++I) {
28994 const TargetRegisterClass *RC = nullptr;
28995 if (AArch64::GPR64RegClass.contains(*I))
28996 RC = &AArch64::GPR64RegClass;
28997 else if (AArch64::FPR64RegClass.contains(*I))
28998 RC = &AArch64::FPR64RegClass;
28999 else
29000 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
29001
29002 Register NewVR = MRI->createVirtualRegister(RC);
29003 // Create copy from CSR to a virtual register.
29004 // FIXME: this currently does not emit CFI pseudo-instructions, it works
29005 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
29006 // nounwind. If we want to generalize this later, we may need to emit
29007 // CFI pseudo-instructions.
29008 assert(Entry->getParent()->getFunction().hasFnAttribute(
29009 Attribute::NoUnwind) &&
29010 "Function should be nounwind in insertCopiesSplitCSR!");
29011 Entry->addLiveIn(*I);
29012 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
29013 .addReg(*I);
29014
29015 // Insert the copy-back instructions right before the terminator.
29016 for (auto *Exit : Exits)
29017 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
29018 TII->get(TargetOpcode::COPY), *I)
29019 .addReg(NewVR);
29020 }
29021}
29022
29024 // Integer division on AArch64 is expensive. However, when aggressively
29025 // optimizing for code size, we prefer to use a div instruction, as it is
29026 // usually smaller than the alternative sequence.
29027 // The exception to this is vector division. Since AArch64 doesn't have vector
29028 // integer division, leaving the division as-is is a loss even in terms of
29029 // size, because it will have to be scalarized, while the alternative code
29030 // sequence can be performed in vector form.
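// Illustrative example (assumption): under minsize, "sdiv i32 %x, 100" is
// kept as one sdiv instruction rather than being rewritten into the longer
// magic-number multiply/shift sequence.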
29031 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
29032 return OptSize && !VT.isVector();
29033}
29034
29036 const MachineFunction &MF) const {
29037 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
29038 // In future, we could allow this when SVE is available, but currently,
29039 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
29040 // the general lowering may introduce stack spills/reloads).
29041 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
29042 return false;
29043
29044 // Do not merge to float value size (128 bits) if no implicit float attribute
29045 // is set.
29046 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
29047 return !NoFloat || MemVT.getSizeInBits() <= 64;
29048}
29049
29051 // We want inc-of-add for scalars and sub-of-not for vectors.
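// Illustrative identity (assumption): "(x + y) + 1" and "x - ~y" compute the
// same value; the first form is preferred for scalars, the second maps better
// onto NEON for vectors.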
29052 return VT.isScalarInteger();
29053}
29054
29056 EVT VT) const {
29057 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
29058 // legalize.
29059 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
29060 return false;
29061 if (FPVT == MVT::v8bf16)
29062 return false;
29063 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
29064}
29065
29067 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
29068 // avoid vselect becoming bsl / unrolling.
29069 return !VT.isFixedLengthVector();
29070}
29071
29075 const TargetInstrInfo *TII) const {
29076 assert(MBBI->isCall() && MBBI->getCFIType() &&
29077 "Invalid call instruction for a KCFI check");
29078
29079 switch (MBBI->getOpcode()) {
29080 case AArch64::BLR:
29081 case AArch64::BLRNoIP:
29082 case AArch64::TCRETURNri:
29083 case AArch64::TCRETURNrix16x17:
29084 case AArch64::TCRETURNrix17:
29085 case AArch64::TCRETURNrinotx16:
29086 break;
29087 default:
29088 llvm_unreachable("Unexpected CFI call opcode");
29089 }
29090
29091 MachineOperand &Target = MBBI->getOperand(0);
29092 assert(Target.isReg() && "Invalid target operand for an indirect call");
29093 Target.setIsRenamable(false);
29094
29095 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
29096 .addReg(Target.getReg())
29097 .addImm(MBBI->getCFIType())
29098 .getInstr();
29099}
29100
29102 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
29103}
29104
29105unsigned
29107 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
29108 return getPointerTy(DL).getSizeInBits();
29109
29110 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
29111}
29112
29113void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
29114 MachineFrameInfo &MFI = MF.getFrameInfo();
29115 // If we have any vulnerable SVE stack objects then the stack protector
29116 // needs to be placed at the top of the SVE stack area, as the SVE locals
29117 // are placed above the other locals, so we allocate it as if it were a
29118 // scalable vector.
29119 // FIXME: It may be worthwhile having a specific interface for this rather
29120 // than doing it here in finalizeLowering.
29121 if (MFI.hasStackProtectorIndex()) {
29122 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
29128 break;
29129 }
29130 }
29131 }
29134}
29135
29136// Unlike X86, we let frame lowering assign offsets to all catch objects.
29138
29139bool AArch64TargetLowering::shouldLocalize(
29140 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
29141 auto &MF = *MI.getMF();
29142 auto &MRI = MF.getRegInfo();
29143 auto maxUses = [](unsigned RematCost) {
29144 // A cost of 1 means remats are basically free.
29145 if (RematCost == 1)
29146 return std::numeric_limits<unsigned>::max();
29147 if (RematCost == 2)
29148 return 2U;
29149
29150 // Remat is too expensive, only sink if there's one user.
29151 if (RematCost > 2)
29152 return 1U;
29153 llvm_unreachable("Unexpected remat cost");
29154 };
29155
29156 unsigned Opc = MI.getOpcode();
29157 switch (Opc) {
29158 case TargetOpcode::G_GLOBAL_VALUE: {
29159 // On Darwin, TLS global vars get selected into function calls, which
29160 // we don't want localized, as they can get moved into the middle of
29161 // another call sequence.
29162 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
29163 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
29164 return false;
29165 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
29166 }
29167 case TargetOpcode::G_FCONSTANT:
29168 case TargetOpcode::G_CONSTANT: {
29169 const ConstantInt *CI;
29170 unsigned AdditionalCost = 0;
29171
29172 if (Opc == TargetOpcode::G_CONSTANT)
29173 CI = MI.getOperand(1).getCImm();
29174 else {
29175 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
29176 // We try to estimate cost of 32/64b fpimms, as they'll likely be
29177 // materialized as integers.
29178 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
29179 break;
29180 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
29181 bool OptForSize = MF.getFunction().hasOptSize();
29183 OptForSize))
29184 return true; // Constant should be cheap.
29185 CI =
29186 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
29187 // FP materialization also costs an extra move, from gpr to fpr.
29188 AdditionalCost = 1;
29189 }
29190 APInt Imm = CI->getValue();
29193 assert(Cost.isValid() && "Expected a valid imm cost");
29194
29195 unsigned RematCost = Cost.getValue();
29196 RematCost += AdditionalCost;
29197 Register Reg = MI.getOperand(0).getReg();
29198 unsigned MaxUses = maxUses(RematCost);
29199 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
29200 if (MaxUses == std::numeric_limits<unsigned>::max())
29201 --MaxUses;
29202 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
29203 }
29204 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
29205 // localizable.
29206 case AArch64::ADRP:
29207 case AArch64::G_ADD_LOW:
29208 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
29209 case TargetOpcode::G_PTR_ADD:
29210 return true;
29211 default:
29212 break;
29213 }
29215}
29216
29218 // Fallback for scalable vectors.
29219 // Note that if EnableSVEGISel is true, we allow scalable vector types for
29220 // all instructions, regardless of whether they are actually supported.
29221 if (!EnableSVEGISel) {
29222 if (Inst.getType()->isScalableTy()) {
29223 return true;
29224 }
29225
29226 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
29227 if (Inst.getOperand(i)->getType()->isScalableTy())
29228 return true;
29229
29230 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
29231 if (AI->getAllocatedType()->isScalableTy())
29232 return true;
29233 }
29234 }
29235
29236 // Checks to allow the use of SME instructions
29237 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
29238 auto CallAttrs = SMECallAttrs(*Base, this);
29239 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
29240 CallAttrs.requiresPreservingZT0() ||
29241 CallAttrs.requiresPreservingAllZAState())
29242 return true;
29243 }
29244 return false;
29245}
29246
29247// Return the largest legal scalable vector type that matches VT's element type.
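// Illustrative example (assumption): a fixed-length v8i16 maps to the
// container type nxv8i16, and v4f32 maps to nxv4f32.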
29251 "Expected legal fixed length vector!");
29252 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29253 default:
29254 llvm_unreachable("unexpected element type for SVE container");
29255 case MVT::i8:
29256 return EVT(MVT::nxv16i8);
29257 case MVT::i16:
29258 return EVT(MVT::nxv8i16);
29259 case MVT::i32:
29260 return EVT(MVT::nxv4i32);
29261 case MVT::i64:
29262 return EVT(MVT::nxv2i64);
29263 case MVT::bf16:
29264 return EVT(MVT::nxv8bf16);
29265 case MVT::f16:
29266 return EVT(MVT::nxv8f16);
29267 case MVT::f32:
29268 return EVT(MVT::nxv4f32);
29269 case MVT::f64:
29270 return EVT(MVT::nxv2f64);
29271 }
29272}
29273
29274// Return a predicate with active lanes corresponding to the extent of VT.
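// Illustrative example (assumption): for a 256-bit v8i32 operand this yields
// the equivalent of "ptrue p0.s, vl8", i.e. an nxv4i1 predicate whose first
// eight lanes are active.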
29276 EVT VT) {
29279 "Expected legal fixed length vector!");
29280
29281 std::optional<unsigned> PgPattern =
29283 assert(PgPattern && "Unexpected element count for SVE predicate");
29284
29285 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
29286 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
29287 // variants of instructions when available.
29288 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29289 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29290 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29291 if (MaxSVESize && MinSVESize == MaxSVESize &&
29292 MaxSVESize == VT.getSizeInBits())
29293 PgPattern = AArch64SVEPredPattern::all;
29294
29295 MVT MaskVT;
29296 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29297 default:
29298 llvm_unreachable("unexpected element type for SVE predicate");
29299 case MVT::i8:
29300 MaskVT = MVT::nxv16i1;
29301 break;
29302 case MVT::i16:
29303 case MVT::f16:
29304 case MVT::bf16:
29305 MaskVT = MVT::nxv8i1;
29306 break;
29307 case MVT::i32:
29308 case MVT::f32:
29309 MaskVT = MVT::nxv4i1;
29310 break;
29311 case MVT::i64:
29312 case MVT::f64:
29313 MaskVT = MVT::nxv2i1;
29314 break;
29315 }
29316
29317 return getPTrue(DAG, DL, MaskVT, *PgPattern);
29318}
29319
29321 EVT VT) {
29323 "Expected legal scalable vector!");
29324 auto PredTy = VT.changeVectorElementType(MVT::i1);
29325 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
29326}
29327
29329 if (VT.isFixedLengthVector())
29330 return getPredicateForFixedLengthVector(DAG, DL, VT);
29331
29332 return getPredicateForScalableVector(DAG, DL, VT);
29333}
29334
29335// Grow V to consume an entire SVE register.
29337 assert(VT.isScalableVector() &&
29338 "Expected to convert into a scalable vector!");
29339 assert(V.getValueType().isFixedLengthVector() &&
29340 "Expected a fixed length vector operand!");
29341 SDLoc DL(V);
29342 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29343 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
29344}
29345
29346// Shrink V so it's just big enough to maintain a VT's worth of data.
29349 "Expected to convert into a fixed length vector!");
29350 assert(V.getValueType().isScalableVector() &&
29351 "Expected a scalable vector operand!");
29352 SDLoc DL(V);
29353 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29354 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
29355}
29356
29357// Convert all fixed length vector loads larger than NEON to masked_loads.
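// Illustrative example (assumption): with 256-bit SVE registers, a plain
//   %v = load <8 x i32>, ptr %p
// becomes a predicated "ld1w { z0.s }, p0/z, [x0]" where p0 = ptrue vl8.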
29358SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
29359 SDValue Op, SelectionDAG &DAG) const {
29360 auto Load = cast<LoadSDNode>(Op);
29361
29362 SDLoc DL(Op);
29363 EVT VT = Op.getValueType();
29364 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29365 EVT LoadVT = ContainerVT;
29366 EVT MemVT = Load->getMemoryVT();
29367
29368 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29369
29370 if (VT.isFloatingPoint()) {
29371 LoadVT = ContainerVT.changeTypeToInteger();
29372 MemVT = MemVT.changeTypeToInteger();
29373 }
29374
29375 SDValue NewLoad = DAG.getMaskedLoad(
29376 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
29377 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
29378 Load->getAddressingMode(), Load->getExtensionType());
29379
29380 SDValue Result = NewLoad;
29381 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
29382 EVT ExtendVT = ContainerVT.changeVectorElementType(
29383 Load->getMemoryVT().getVectorElementType());
29384
29385 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
29386 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29387 Pg, Result, DAG.getUNDEF(ContainerVT));
29388 } else if (VT.isFloatingPoint()) {
29389 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
29390 }
29391
29392 Result = convertFromScalableVector(DAG, VT, Result);
29393 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29394 return DAG.getMergeValues(MergedValues, DL);
29395}
29396
29398 SelectionDAG &DAG) {
29399 SDLoc DL(Mask);
29400 EVT InVT = Mask.getValueType();
29401 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29403
29404 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
29405 return Pg;
29406
29407 bool InvertCond = false;
29408 if (isBitwiseNot(Mask)) {
29409 InvertCond = true;
29410 Mask = Mask.getOperand(0);
29411 }
29412
29413 SDValue Op1, Op2;
29414 ISD::CondCode CC;
29415
29416 // When Mask is the result of a SETCC, it's better to regenerate the compare.
29417 if (Mask.getOpcode() == ISD::SETCC) {
29418 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
29419 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
29420 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
29421 } else {
29422 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
29423 Op2 = DAG.getConstant(0, DL, ContainerVT);
29424 CC = ISD::SETNE;
29425 }
29426
29427 if (InvertCond)
29428 CC = getSetCCInverse(CC, Op1.getValueType());
29429
29430 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
29431 {Pg, Op1, Op2, DAG.getCondCode(CC)});
29432}
29433
29434// Convert all fixed length vector loads larger than NEON to masked_loads.
29435SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
29436 SDValue Op, SelectionDAG &DAG) const {
29437 auto Load = cast<MaskedLoadSDNode>(Op);
29438
29439 SDLoc DL(Op);
29440 EVT VT = Op.getValueType();
29441 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29442
29443 SDValue Mask = Load->getMask();
29444 // If this is an extending load and the mask type is not the same as
29445 // the load's type, then we have to extend the mask type.
29446 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
29447 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
29448 "Incorrect mask type");
29449 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
29450 }
29452
29453 SDValue PassThru;
29454 bool IsPassThruZeroOrUndef = false;
29455
29456 if (Load->getPassThru()->isUndef()) {
29457 PassThru = DAG.getUNDEF(ContainerVT);
29458 IsPassThruZeroOrUndef = true;
29459 } else {
29460 if (ContainerVT.isInteger())
29461 PassThru = DAG.getConstant(0, DL, ContainerVT);
29462 else
29463 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
29464 if (isZerosVector(Load->getPassThru().getNode()))
29465 IsPassThruZeroOrUndef = true;
29466 }
29467
29468 SDValue NewLoad = DAG.getMaskedLoad(
29469 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
29470 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
29471 Load->getAddressingMode(), Load->getExtensionType());
29472
29473 SDValue Result = NewLoad;
29474 if (!IsPassThruZeroOrUndef) {
29475 SDValue OldPassThru =
29476 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
29477 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
29478 }
29479
29480 Result = convertFromScalableVector(DAG, VT, Result);
29481 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29482 return DAG.getMergeValues(MergedValues, DL);
29483}
29484
29485// Convert all fixed length vector stores larger than NEON to masked_stores.
29486SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
29487 SDValue Op, SelectionDAG &DAG) const {
29488 auto Store = cast<StoreSDNode>(Op);
29489
29490 SDLoc DL(Op);
29491 EVT VT = Store->getValue().getValueType();
29492 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29493 EVT MemVT = Store->getMemoryVT();
29494
29495 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29496 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29497
29498 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
29499 EVT TruncVT = ContainerVT.changeVectorElementType(
29500 Store->getMemoryVT().getVectorElementType());
29501 MemVT = MemVT.changeTypeToInteger();
29502 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
29503 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
29504 DAG.getUNDEF(TruncVT));
29505 NewValue =
29506 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29507 } else if (VT.isFloatingPoint()) {
29508 MemVT = MemVT.changeTypeToInteger();
29509 NewValue =
29510 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29511 }
29512
29513 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
29514 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
29515 Store->getMemOperand(), Store->getAddressingMode(),
29516 Store->isTruncatingStore());
29517}
29518
29519SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
29520 SDValue Op, SelectionDAG &DAG) const {
29521 auto *Store = cast<MaskedStoreSDNode>(Op);
29522
29523 SDLoc DL(Op);
29524 EVT VT = Store->getValue().getValueType();
29525 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29526
29527 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29529
29530 return DAG.getMaskedStore(
29531 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
29532 Mask, Store->getMemoryVT(), Store->getMemOperand(),
29533 Store->getAddressingMode(), Store->isTruncatingStore());
29534}
29535
29536SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
29537 SDValue Op, SelectionDAG &DAG) const {
29538 SDLoc DL(Op);
29539 EVT VT = Op.getValueType();
29540 EVT EltVT = VT.getVectorElementType();
29541
29542 bool Signed = Op.getOpcode() == ISD::SDIV;
29543 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
29544
29545 bool Negated;
29546 uint64_t SplatVal;
29547 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
29548 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29549 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29550 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
29551
29553 SDValue Res =
29554 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
29555 if (Negated)
29556 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
29557 DAG.getConstant(0, DL, ContainerVT), Res);
29558
29559 return convertFromScalableVector(DAG, VT, Res);
29560 }
29561
29562 // Scalable vector i32/i64 DIV is supported.
29563 if (EltVT == MVT::i32 || EltVT == MVT::i64)
29564 return LowerToPredicatedOp(Op, DAG, PredOpcode);
29565
29566 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
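// Illustrative sketch (assumption): a v8i16 sdiv is sign-extended to v8i32
// (or split into two halves first when the wider type is not legal), divided
// using the predicated 32-bit DIV, and truncated back to v8i16.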
29567 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
29568 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
29569 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29570
29571 // If the wider type is legal: extend, op, and truncate.
29572 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
29573 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
29574 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
29575 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
29576 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
29577 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
29578 }
29579
29580 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
29581 &ExtendOpcode](SDValue Op) {
29582 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
29583 SDValue IdxHalf =
29584 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
29585 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
29586 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
29587 return std::pair<SDValue, SDValue>(
29588 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
29589 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
29590 };
29591
29592 // If wider type is not legal: split, extend, op, trunc and concat.
29593 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
29594 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
29595 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
29596 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
29597 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
29598 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
29599 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
29600}
29601
29602SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
29603 SDValue Op, SelectionDAG &DAG) const {
29604 EVT VT = Op.getValueType();
29605 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29606
29607 SDLoc DL(Op);
29608 SDValue Val = Op.getOperand(0);
29609 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29610 Val = convertToScalableVector(DAG, ContainerVT, Val);
29611
29612 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
29613 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
29614
29615 // Repeatedly unpack Val until the result is of the desired element type.
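// Illustrative example (assumption): sign-extending i8 elements to i32
// applies SUNPKLO twice: nxv16i8 -> nxv8i16 -> nxv4i32.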
29616 switch (ContainerVT.getSimpleVT().SimpleTy) {
29617 default:
29618 llvm_unreachable("unimplemented container type");
29619 case MVT::nxv16i8:
29620 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
29621 if (VT.getVectorElementType() == MVT::i16)
29622 break;
29623 [[fallthrough]];
29624 case MVT::nxv8i16:
29625 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
29626 if (VT.getVectorElementType() == MVT::i32)
29627 break;
29628 [[fallthrough]];
29629 case MVT::nxv4i32:
29630 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
29631 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
29632 break;
29633 }
29634
29635 return convertFromScalableVector(DAG, VT, Val);
29636}
29637
29638SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
29639 SDValue Op, SelectionDAG &DAG) const {
29640 EVT VT = Op.getValueType();
29641 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29642
29643 SDLoc DL(Op);
29644 SDValue Val = Op.getOperand(0);
29645 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29646 Val = convertToScalableVector(DAG, ContainerVT, Val);
29647
29648 // Repeatedly truncate Val until the result is of the desired element type.
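// Illustrative example (assumption): truncating i64 elements to i16 applies
// UZP1 twice: nxv2i64 -> nxv4i32 -> nxv8i16, keeping the low half of each
// element at every step.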
29649 switch (ContainerVT.getSimpleVT().SimpleTy) {
29650 default:
29651 llvm_unreachable("unimplemented container type");
29652 case MVT::nxv2i64:
29653 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
29654 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
29655 if (VT.getVectorElementType() == MVT::i32)
29656 break;
29657 [[fallthrough]];
29658 case MVT::nxv4i32:
29659 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
29660 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
29661 if (VT.getVectorElementType() == MVT::i16)
29662 break;
29663 [[fallthrough]];
29664 case MVT::nxv8i16:
29665 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
29666 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
29667 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
29668 break;
29669 }
29670
29671 return convertFromScalableVector(DAG, VT, Val);
29672}
29673
29674SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
29675 SDValue Op, SelectionDAG &DAG) const {
29676 EVT VT = Op.getValueType();
29677 EVT InVT = Op.getOperand(0).getValueType();
29678 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
29679
29680 SDLoc DL(Op);
29681 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29682 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
29683
29684 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
29685}
29686
29687SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
29688 SDValue Op, SelectionDAG &DAG) const {
29689 EVT VT = Op.getValueType();
29690 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29691
29692 SDLoc DL(Op);
29693 EVT InVT = Op.getOperand(0).getValueType();
29694 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29695 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
29696
29697 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
29698 Op.getOperand(1), Op.getOperand(2));
29699
29700 return convertFromScalableVector(DAG, VT, ScalableRes);
29701}
29702
29703// Convert vector operation 'Op' to an equivalent predicated operation whereby
29704// the original operation's type is used to construct a suitable predicate.
29705// NOTE: The results for inactive lanes are undefined.
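// Illustrative example (assumption): a fixed-length v4i32 ISD::MUL can be
// rewritten as a predicated MUL on nxv4i32 governed by the equivalent of
// "ptrue p0.s, vl4"; lanes outside the original vector are left undefined.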
29706SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
29707 SelectionDAG &DAG,
29708 unsigned NewOp) const {
29709 EVT VT = Op.getValueType();
29710 SDLoc DL(Op);
29711 auto Pg = getPredicateForVector(DAG, DL, VT);
29712
29713 if (VT.isFixedLengthVector()) {
29714 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
29715 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29716
29717 // Create list of operands by converting existing ones to scalable types.
29719 for (const SDValue &V : Op->op_values()) {
29720 if (isa<CondCodeSDNode>(V)) {
29721 Operands.push_back(V);
29722 continue;
29723 }
29724
29725 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
29726 EVT VTArg = VTNode->getVT().getVectorElementType();
29727 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
29728 Operands.push_back(DAG.getValueType(NewVTArg));
29729 continue;
29730 }
29731
29732 assert(isTypeLegal(V.getValueType()) &&
29733 "Expected only legal fixed-width types");
29734 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
29735 }
29736
29737 if (isMergePassthruOpcode(NewOp))
29738 Operands.push_back(DAG.getUNDEF(ContainerVT));
29739
29740 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
29741 return convertFromScalableVector(DAG, VT, ScalableRes);
29742 }
29743
29744 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
29745
29747 for (const SDValue &V : Op->op_values()) {
29748 assert((!V.getValueType().isVector() ||
29749 V.getValueType().isScalableVector()) &&
29750 "Only scalable vectors are supported!");
29751 Operands.push_back(V);
29752 }
29753
29754 if (isMergePassthruOpcode(NewOp))
29755 Operands.push_back(DAG.getUNDEF(VT));
29756
29757 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
29758}
29759
29760// If a fixed length vector operation has no side effects when applied to
29761// undefined elements, we can safely use scalable vectors to perform the same
29762// operation without needing to worry about predication.
29763SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
29764 SelectionDAG &DAG) const {
29765 EVT VT = Op.getValueType();
29767 "Only expected to lower fixed length vector operation!");
29768 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29769
29770 // Create list of operands by converting existing ones to scalable types.
29772 for (const SDValue &V : Op->op_values()) {
29773 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
29774
29775 // Pass through non-vector operands.
29776 if (!V.getValueType().isVector()) {
29777 Ops.push_back(V);
29778 continue;
29779 }
29780
29781 // "cast" fixed length vector to a scalable vector.
29782 assert(V.getValueType().isFixedLengthVector() &&
29783 isTypeLegal(V.getValueType()) &&
29784 "Only fixed length vectors are supported!");
29785 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
29786 }
29787
29788 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
29789 return convertFromScalableVector(DAG, VT, ScalableRes);
29790}
29791
29792SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
29793 SelectionDAG &DAG) const {
29794 SDLoc DL(ScalarOp);
29795 SDValue AccOp = ScalarOp.getOperand(0);
29796 SDValue VecOp = ScalarOp.getOperand(1);
29797 EVT SrcVT = VecOp.getValueType();
29798 EVT ResVT = SrcVT.getVectorElementType();
29799
29800 EVT ContainerVT = SrcVT;
29801 if (SrcVT.isFixedLengthVector()) {
29802 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
29803 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
29804 }
29805
29806 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
29807 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29808
29809 // Convert operands to Scalable.
29810 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
29811 DAG.getUNDEF(ContainerVT), AccOp, Zero);
29812
29813 // Perform reduction.
29814 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
29815 Pg, AccOp, VecOp);
29816
29817 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
29818}
29819
29820SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
29821 SelectionDAG &DAG) const {
29822 SDLoc DL(ReduceOp);
29823 SDValue Op = ReduceOp.getOperand(0);
29824 EVT OpVT = Op.getValueType();
29825 EVT VT = ReduceOp.getValueType();
29826
29827 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
29828 return SDValue();
29829
29830 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
29831
29832 switch (ReduceOp.getOpcode()) {
29833 default:
29834 return SDValue();
29835 case ISD::VECREDUCE_OR:
29836 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
29837 // The predicate can be 'Op' because
29838 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
29839 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
29840 else
29841 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
29842 case ISD::VECREDUCE_AND: {
29843 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
29844 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
29845 }
29846 case ISD::VECREDUCE_XOR: {
29847 SDValue ID =
29848 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
29849 if (OpVT == MVT::nxv1i1) {
29850 // Emulate a CNTP on .Q using .D and a different governing predicate.
29851 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
29852 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
29853 }
29854 SDValue Cntp =
29855 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
29856 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
29857 }
29858 }
29859
29860 return SDValue();
29861}
29862
29863SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
29864 SDValue ScalarOp,
29865 SelectionDAG &DAG) const {
29866 SDLoc DL(ScalarOp);
29867 SDValue VecOp = ScalarOp.getOperand(0);
29868 EVT SrcVT = VecOp.getValueType();
29869
29871 SrcVT,
29872 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
29873 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
29874 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
29875 }
29876
29877 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
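// Illustrative example (assumption):
//   vecreduce_add(zext nxv16i1 %p to nxv16i8)
// is lowered to "cntp x0, p0, p0.b" instead of a UADDV reduction.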
29878 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
29879 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
29880 SDValue BoolVec = VecOp.getOperand(0);
29881 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
29882 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
29883 SDValue CntpOp = DAG.getNode(
29884 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
29885 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
29886 BoolVec, BoolVec);
29887 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
29888 }
29889 }
29890
29891 // UADDV always returns an i64 result.
29892 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
29893 SrcVT.getVectorElementType();
29894 EVT RdxVT = SrcVT;
29895 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
29896 RdxVT = getPackedSVEVectorVT(ResVT);
29897
29898 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
29899 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
29900 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
29901 Rdx, DAG.getConstant(0, DL, MVT::i64));
29902
29903 // The VEC_REDUCE nodes expect an element size result.
29904 if (ResVT != ScalarOp.getValueType())
29905 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
29906
29907 return Res;
29908}
29909
29910SDValue
29911AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
29912 SelectionDAG &DAG) const {
29913 EVT VT = Op.getValueType();
29914 SDLoc DL(Op);
29915
29916 EVT InVT = Op.getOperand(1).getValueType();
29917 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29918 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
29919 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
29920
29921 // Convert the mask to a predicate (NOTE: We don't need to worry about
29922 // inactive lanes since VSELECT is safe when given undefined elements).
29923 EVT MaskVT = Op.getOperand(0).getValueType();
29924 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
29925 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
29927 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
29928
29929 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
29930 Mask, Op1, Op2);
29931
29932 return convertFromScalableVector(DAG, VT, ScalableRes);
29933}
29934
29935SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
29936 SDValue Op, SelectionDAG &DAG) const {
29937 SDLoc DL(Op);
29938 EVT InVT = Op.getOperand(0).getValueType();
29939 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29940
29941 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
29942 "Only expected to lower fixed length vector operation!");
29943 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
29944 "Expected integer result of the same bit length as the inputs!");
29945
29946 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29947 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
29948 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
29949
29950 EVT CmpVT = Pg.getValueType();
29951 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
29952 {Pg, Op1, Op2, Op.getOperand(2)});
29953
29954 EVT PromoteVT = ContainerVT.changeTypeToInteger();
29955 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
29956 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
29957}
29958
29959SDValue
29960AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
29961 SelectionDAG &DAG) const {
29962 SDLoc DL(Op);
29963 auto SrcOp = Op.getOperand(0);
29964 EVT VT = Op.getValueType();
29965 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29966 EVT ContainerSrcVT =
29967 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
29968
29969 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
29970 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
29971 return convertFromScalableVector(DAG, VT, Op);
29972}
29973
29974SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
29975 SDValue Op, SelectionDAG &DAG) const {
29976 SDLoc DL(Op);
29977 unsigned NumOperands = Op->getNumOperands();
29978
29979 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
29980 "Unexpected number of operands in CONCAT_VECTORS");
29981
29982 auto SrcOp1 = Op.getOperand(0);
29983 auto SrcOp2 = Op.getOperand(1);
29984 EVT VT = Op.getValueType();
29985 EVT SrcVT = SrcOp1.getValueType();
29986
29987 // Match a splat of 128b segments that fit in a single register.
29988 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
29989 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29990 SDValue Splat =
29991 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
29992 convertToScalableVector(DAG, ContainerVT, SrcOp1),
29993 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
29994 return convertFromScalableVector(DAG, VT, Splat);
29995 }
29996
29997 if (NumOperands > 2) {
29998 SmallVector<SDValue, 4> Ops;
29999 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
30000 for (unsigned I = 0; I < NumOperands; I += 2)
30001 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
30002 Op->getOperand(I), Op->getOperand(I + 1)));
30003
30004 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
30005 }
30006
30007 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30008
30009 SDValue Pg = getPredicateForVector(DAG, DL, ContainerVT);
30010 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
30011 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
30012
30013 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
30014
30015 return convertFromScalableVector(DAG, VT, Op);
30016}
30017
30018SDValue
30019AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
30020 SelectionDAG &DAG) const {
30021 EVT VT = Op.getValueType();
30022 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30023
30024 SDLoc DL(Op);
30025 SDValue Val = Op.getOperand(0);
30026 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30027 EVT SrcVT = Val.getValueType();
30028 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30029 EVT ExtendVT = ContainerVT.changeVectorElementType(
30030 SrcVT.getVectorElementType());
30031
30032 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30033 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
30034
30035 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
30036 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
30037 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
30038 Pg, Val, DAG.getUNDEF(ContainerVT));
30039
30040 return convertFromScalableVector(DAG, VT, Val);
30041}
30042
30043SDValue
30044AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
30045 SelectionDAG &DAG) const {
30046 EVT VT = Op.getValueType();
30047 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30048
30049 SDLoc DL(Op);
30050 SDValue Val = Op.getOperand(0);
30051 EVT SrcVT = Val.getValueType();
30052 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30053 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
30054 VT.getVectorElementType());
30055 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
30056
30057 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30058 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
30059 Op.getOperand(1), DAG.getUNDEF(RoundVT));
30060 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
30061 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30062
30063 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30064 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30065}
30066
30067SDValue
30068AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
30069 SelectionDAG &DAG) const {
30070 EVT VT = Op.getValueType();
30071 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30072
30073 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
30074 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
30075 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
30076
30077 SDLoc DL(Op);
30078 SDValue Val = Op.getOperand(0);
30079 EVT SrcVT = Val.getValueType();
30080 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30081 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30082
30083 if (VT.bitsGE(SrcVT)) {
30084 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30085
30086 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
30087 VT.changeTypeToInteger(), Val);
30088
30089 // Safe to use a larger than specified operand because by promoting the
30090 // value nothing has changed from an arithmetic point of view.
30091 Val =
30092 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
30093 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30094 DAG.getUNDEF(ContainerDstVT));
30095 return convertFromScalableVector(DAG, VT, Val);
30096 } else {
30097 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
30098 ContainerDstVT.getVectorElementType());
30099 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30100
30101 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30102 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30103 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
30104 Val = convertFromScalableVector(DAG, SrcVT, Val);
30105
30106 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30107 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30108 }
30109}
30110
30111SDValue
30112AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
30113 SelectionDAG &DAG) const {
30114 SDLoc DL(Op);
30115 EVT OpVT = Op.getValueType();
30116 assert(OpVT.isScalableVector() &&
30117 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
30118
30119 // Are multi-register uzp instructions available?
30120 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30121 OpVT.getVectorElementType() != MVT::i1) {
30122 Intrinsic::ID IntID;
30123 switch (Op->getNumOperands()) {
30124 default:
30125 return SDValue();
30126 case 2:
30127 IntID = Intrinsic::aarch64_sve_uzp_x2;
30128 break;
30129 case 4:
30130 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30131 OpVT.getScalarSizeInBits() == 64)
30132 return SDValue();
30133 IntID = Intrinsic::aarch64_sve_uzp_x4;
30134 break;
30135 }
30136
30137 SmallVector<SDValue, 5> Ops;
30138 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30139 Ops.append(Op->op_values().begin(), Op->op_values().end());
30140 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30141 }
30142
30143 if (Op->getNumOperands() != 2)
30144 return SDValue();
30145
30146 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
30147 Op.getOperand(1));
30148 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
30149 Op.getOperand(1));
30150 return DAG.getMergeValues({Even, Odd}, DL);
30151}
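// A minimal sketch of the two-operand fallback above, assuming two nxv4i32
// inputs A and B (names chosen for illustration):
//   Even = UZP1(A, B)  // lanes A0,A2,...,B0,B2,...
//   Odd  = UZP2(A, B)  // lanes A1,A3,...,B1,B3,...
// and the pair {Even, Odd} is returned as the deinterleaved result.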
30152
30153SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
30154 SelectionDAG &DAG) const {
30155 SDLoc DL(Op);
30156 EVT OpVT = Op.getValueType();
30157 assert(OpVT.isScalableVector() &&
30158 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
30159
30160 // Are multi-register zip instructions available?
30161 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30162 OpVT.getVectorElementType() != MVT::i1) {
30163 Intrinsic::ID IntID;
30164 switch (Op->getNumOperands()) {
30165 default:
30166 return SDValue();
30167 case 2:
30168 IntID = Intrinsic::aarch64_sve_zip_x2;
30169 break;
30170 case 4:
30171 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30172 OpVT.getScalarSizeInBits() == 64)
30173 return SDValue();
30174 IntID = Intrinsic::aarch64_sve_zip_x4;
30175 break;
30176 }
30177
30178 SmallVector<SDValue, 5> Ops;
30179 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30180 Ops.append(Op->op_values().begin(), Op->op_values().end());
30181 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30182 }
30183
30184 if (Op->getNumOperands() != 2)
30185 return SDValue();
30186
30187 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
30188 Op.getOperand(1));
30189 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
30190 Op.getOperand(1));
30191 return DAG.getMergeValues({Lo, Hi}, DL);
30192}
30193
30194SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
30195 SelectionDAG &DAG) const {
30196 // FIXME: Maybe share some code with LowerMGather/Scatter?
30197 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
30198 SDLoc DL(HG);
30199 SDValue Chain = HG->getChain();
30200 SDValue Inc = HG->getInc();
30201 SDValue Mask = HG->getMask();
30202 SDValue Ptr = HG->getBasePtr();
30203 SDValue Index = HG->getIndex();
30204 SDValue Scale = HG->getScale();
30205 SDValue IntID = HG->getIntID();
30206
30207 // The Intrinsic ID determines the type of update operation.
30208 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
30209 // Right now, we only support 'add' as an update.
30210 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
30211 "Unexpected histogram update operation");
30212
30213 EVT IndexVT = Index.getValueType();
30214 LLVMContext &Ctx = *DAG.getContext();
30215 ElementCount EC = IndexVT.getVectorElementCount();
30216 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
30217 EVT IncExtVT =
30218 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
30219 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
30220 bool ExtTrunc = IncSplatVT != MemVT;
30221
30222 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30223 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
30224 SDValue IncSplat = DAG.getSplatVector(
30225 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
30226 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
30227
30228 MachineMemOperand *MMO = HG->getMemOperand();
30229 // Create an MMO for the gather, without load|store flags.
30230 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
30231 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
30232 MMO->getAlign(), MMO->getAAInfo());
30233 ISD::MemIndexType IndexType = HG->getIndexType();
30234 SDValue Gather = DAG.getMaskedGather(
30235 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
30236 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
30237
30238 SDValue GChain = Gather.getValue(1);
30239
30240 // Perform the histcnt, multiply by inc, add to bucket data.
30241 SDValue ID =
30242 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
30243 SDValue HistCnt =
30244 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
30245 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
30246 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
30247
30248 // Create an MMO for the scatter, without load|store flags.
30249 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
30250 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
30251 MMO->getAlign(), MMO->getAAInfo());
30252
30253 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
30254 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
30255 ScatterOps, SMMO, IndexType, ExtTrunc);
30256 return Scatter;
30257}
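// A minimal sketch of the expansion above, assuming a histogram-add update
// with increment Inc:
//   Gather  = masked.gather(bucket data at Ptr + Index * Scale)
//   HistCnt = aarch64.sve.histcnt(Mask, Index, Index)   // per-lane match count
//   Scatter = masked.scatter(Gather + HistCnt * splat(Inc), same addresses)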
30258
30259/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
30260/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
30261/// however still make use of the dot product instruction by instead
30262/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
30263/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
30264/// the following pattern is emitted:
30265/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))), ext(EXTRACT_SUBVECTOR(N,
30266/// NTy/2)))
30267SDValue
30268AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
30269 SelectionDAG &DAG) const {
30270 SDLoc DL(Op);
30271
30272 SDValue Acc = Op.getOperand(0);
30273 SDValue LHS = Op.getOperand(1);
30274 SDValue RHS = Op.getOperand(2);
30275 EVT ResultVT = Op.getValueType();
30276 EVT OrigResultVT = ResultVT;
30277 EVT OpVT = LHS.getValueType();
30278
30279 bool ConvertToScalable =
30280 ResultVT.isFixedLengthVector() &&
30281 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
30282
30283 if (ConvertToScalable) {
30284 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
30285 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
30286 Acc = convertToScalableVector(DAG, ResultVT, Acc);
30287 LHS = convertToScalableVector(DAG, OpVT, LHS);
30288 RHS = convertToScalableVector(DAG, OpVT, RHS);
30289 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
30290 }
30291
30292 // Two-way and four-way partial reductions are supported by patterns.
30293 // We only need to handle the 8-way partial reduction.
30294 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
30295 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
30296 : Op;
30297
30298 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
30299 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
30300 DAG.getConstant(0, DL, DotVT), LHS, RHS);
30301
30302 SDValue Res;
30303 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
30304 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
30305 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
30306 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
30307 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
30308 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
30309 } else {
30310 // Fold (nx)v4i32 into (nx)v2i64
30311 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
30312 if (IsUnsigned) {
30313 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
30314 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
30315 } else {
30316 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
30317 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
30318 }
30319 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
30320 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
30321 }
30322
30323 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
30324 : Res;
30325}
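// A minimal sketch of the 8-way case handled above, assuming an nxv2i64
// accumulator Acc with nxv16i8 inputs:
//   Dot = (u|s)dot(zero:nxv4i32, LHS, RHS)
//   SVE2/streaming: Res = (U|S)ADDWT((U|S)ADDWB(Acc, Dot), Dot)
//   otherwise:      split Dot into lo/hi halves, extend each to nxv2i64 and
//                   add both into Acc.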
30326
30327SDValue
30328AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
30329 SelectionDAG &DAG) const {
30330 EVT VT = Op.getValueType();
30331 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30332
30333 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
30334 "Lowering fixed length get_active_lane_mask requires SVE!");
30335
30336 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
30337 // but we can use SVE when available.
30338
30339 SDLoc DL(Op);
30340 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30341 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
30342
30343 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
30344 Op.getOperand(0), Op.getOperand(1));
30345 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
30346 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
30347 DAG.getVectorIdxConstant(0, DL));
30348}
30349
30350SDValue
30351AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
30352 SelectionDAG &DAG) const {
30353 EVT VT = Op.getValueType();
30354 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30355
30356 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
30357 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
30358 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
30359
30360 SDLoc DL(Op);
30361 SDValue Val = Op.getOperand(0);
30362 EVT SrcVT = Val.getValueType();
30363 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30364 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30365
30366 if (VT.bitsGT(SrcVT)) {
30367 EVT CvtVT = ContainerDstVT.changeVectorElementType(
30368 ContainerSrcVT.getVectorElementType());
30369 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
30370
30371 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30372 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
30373
30374 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
30375 Val = getSVESafeBitCast(CvtVT, Val, DAG);
30376 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30377 DAG.getUNDEF(ContainerDstVT));
30378 return convertFromScalableVector(DAG, VT, Val);
30379 } else {
30380 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
30381 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
30382
30383 // Safe to use a larger than specified result since an fp_to_int where the
30384 // result doesn't fit into the destination is undefined.
30385 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30386 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30387 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30388
30389 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
30390 }
30391}
30392
30393static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
30394 ArrayRef<int> ShuffleMask, EVT VT,
30395 EVT ContainerVT, SelectionDAG &DAG) {
30396 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
30397 SDLoc DL(Op);
30398 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
30399 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
30400 bool IsSingleOp =
30401 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
30402
30403 if (!Subtarget.isNeonAvailable() && !MinSVESize)
30404 MinSVESize = 128;
30405
30406 // Ignore two-operand shuffles if SVE2 is unavailable or not all index
30407 // values can be represented.
30408 if (!IsSingleOp && !Subtarget.hasSVE2())
30409 return SDValue();
30410
30411 EVT VTOp1 = Op.getOperand(0).getValueType();
30412 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
30413 unsigned IndexLen = MinSVESize / BitsPerElt;
30414 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
30415 uint64_t MaxOffset = maxUIntN(BitsPerElt);
30416 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
30417 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
30418 bool MinMaxEqual = (MinSVESize == MaxSVESize);
30419 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
30420 "Incorrectly legalised shuffle operation");
30421
30423 // If MinSVESize is not equal to MaxSVESize then we need to know which
30424 // TBL mask element needs adjustment.
30425 SmallVector<SDValue, 8> AddRuntimeVLMask;
30426
30427 // Bail out for 8-bit element types, because with a 2048-bit SVE register
30428 // size 8 bits is only sufficient to index into the first source vector.
30429 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
30430 return SDValue();
30431
30432 for (int Index : ShuffleMask) {
30433 // Handling poison index value.
30434 if (Index < 0)
30435 Index = 0;
30436 // If the mask refers to elements in the second operand, then we have to
30437 // offset the index by the number of elements in a vector. If this number
30438 // is not known at compile-time, we need to maintain a mask with 'VL' values
30439 // to add at runtime.
30440 if ((unsigned)Index >= ElementsPerVectorReg) {
30441 if (MinMaxEqual) {
30442 Index += IndexLen - ElementsPerVectorReg;
30443 } else {
30444 Index = Index - ElementsPerVectorReg;
30445 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
30446 }
30447 } else if (!MinMaxEqual)
30448 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30449 // For 8-bit elements and 1024-bit SVE registers and MaxOffset equals
30450 // to 255, this might point to the last element of in the second operand
30451 // of the shufflevector, thus we are rejecting this transform.
30452 if ((unsigned)Index >= MaxOffset)
30453 return SDValue();
30454 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
30455 }
30456
30457 // Choosing an out-of-range index leads to the lane being zeroed vs zero
30458 // value where it would perform first lane duplication for out of
30459 // index elements. For i8 elements an out-of-range index could still be
30460 // valid for a 2048-bit vector register size.
30461 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
30462 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
30463 if (!MinMaxEqual)
30464 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30465 }
30466
30467 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
30468 SDValue VecMask =
30469 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30470 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
30471
30472 SDValue Shuffle;
30473 if (IsSingleOp)
30474 Shuffle =
30475 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30476 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
30477 Op1, SVEMask);
30478 else if (Subtarget.hasSVE2()) {
30479 if (!MinMaxEqual) {
30480 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
30481 SDValue VScale = (BitsPerElt == 64)
30482 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
30483 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
30484 SDValue VecMask =
30485 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30486 SDValue MulByMask = DAG.getNode(
30487 ISD::MUL, DL, MaskType,
30488 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
30489 DAG.getBuildVector(MaskType, DL,
30490 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
30491 SDValue UpdatedVecMask =
30492 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
30493 SVEMask = convertToScalableVector(
30494 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
30495 }
30496 Shuffle =
30497 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30498 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
30499 Op1, Op2, SVEMask);
30500 }
30501 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
30502 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
30503}
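// A minimal sketch of the mask construction above, assuming a two-source
// v4i32 shuffle with mask <0,5,2,7> and a known 128-bit SVE register size
// (IndexLen == ElementsPerVectorReg == 4): indices >= 4 keep pointing into
// the second source, so TBLMask stays {0,5,2,7} and the shuffle is emitted
// as an aarch64_sve_tbl2 of Op1, Op2 and that mask.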
30504
30505SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
30506 SDValue Op, SelectionDAG &DAG) const {
30507 EVT VT = Op.getValueType();
30508 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30509
30510 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
30511 auto ShuffleMask = SVN->getMask();
30512
30513 SDLoc DL(Op);
30514 SDValue Op1 = Op.getOperand(0);
30515 SDValue Op2 = Op.getOperand(1);
30516
30517 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30518 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
30519 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
30520
30521 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
30522 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
30523 return MVT::i32;
30524 return ScalarTy;
30525 };
30526
30527 if (SVN->isSplat()) {
30528 unsigned Lane = std::max(0, SVN->getSplatIndex());
30529 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30530 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30531 DAG.getConstant(Lane, DL, MVT::i64));
30532 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
30533 return convertFromScalableVector(DAG, VT, Op);
30534 }
30535
30536 bool ReverseEXT = false;
30537 unsigned Imm;
30538 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
30539 Imm == VT.getVectorNumElements() - 1) {
30540 if (ReverseEXT)
30541 std::swap(Op1, Op2);
30542 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30543 SDValue Scalar = DAG.getNode(
30544 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30545 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
30546 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
30547 return convertFromScalableVector(DAG, VT, Op);
30548 }
30549
30550 unsigned EltSize = VT.getScalarSizeInBits();
30551 for (unsigned BlockSize : {64U, 32U, 16U}) {
30552 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
30553 unsigned RevOp;
30554 if (EltSize == 8)
30555 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
30556 else if (EltSize == 16)
30557 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
30558 else
30559 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
30560 EVT BlockedVT =
30561 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
30562 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
30563 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
30564 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
30565 DAG.getUNDEF(BlockedVT));
30566 SDValue Container =
30567 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
30568 return convertFromScalableVector(DAG, VT, Container);
30569 }
30570 }
30571
30572 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
30573 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
30574 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30575 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
30576 Pg, Op1, DAG.getUNDEF(ContainerVT));
30577 return convertFromScalableVector(DAG, VT, Revd);
30578 }
30579
30580 unsigned WhichResult;
30581 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30582 WhichResult == 0)
30583 return convertFromScalableVector(
30584 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
30585
30586 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30587 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30588 return convertFromScalableVector(
30589 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30590 }
30591
30592 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
30593 return convertFromScalableVector(
30594 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
30595
30596 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30597 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30598 return convertFromScalableVector(
30599 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30600 }
30601
30602 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
30603 // represents the same logical operation as performed by a ZIP instruction. In
30604 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
30605 // equivalent to an AArch64 instruction. There's the extra component of
30606 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
30607 // only operated on 64/128bit vector types that have a direct mapping to a
30608 // target register and so an exact mapping is implied.
30609 // However, when using SVE for fixed length vectors, most legal vector types
30610 // are actually sub-vectors of a larger SVE register. When mapping
30611 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
30612 // how the mask's indices translate. Specifically, when the mapping requires
30613 // an exact meaning for a specific vector index (e.g. Index X is the last
30614 // vector element in the register) then such mappings are often only safe when
30615 // the exact SVE register size is known. The main exception to this is when
30616 // indices are logically relative to the first element of either
30617 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
30618 // when converting from fixed-length to scalable vector types (i.e. the start
30619 // of a fixed length vector is always the start of a scalable vector).
30620 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
30621 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
30622 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
30623 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
30624 Op2.isUndef()) {
30625 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
30626 return convertFromScalableVector(DAG, VT, Op);
30627 }
30628
30629 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30630 WhichResult != 0)
30631 return convertFromScalableVector(
30632 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
30633
30634 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30635 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30636 return convertFromScalableVector(
30637 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30638 }
30639
30640 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
30641 return convertFromScalableVector(
30642 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
30643
30644 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30645 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30646 return convertFromScalableVector(
30647 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30648 }
30649
30650 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
30651 Subtarget->isSVEorStreamingSVEAvailable()) {
30653 "Unsupported SVE vector size");
30654
30656 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
30657 if (std::optional<unsigned> Lane =
30658 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
30659 SDValue IID =
30660 DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
30661 return convertFromScalableVector(
30662 DAG, VT,
30663 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30664 {IID, Op1,
30665 DAG.getConstant(*Lane, DL, MVT::i64,
30666 /*isTarget=*/true)}));
30667 }
30668 }
30669 }
30670
30671 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
30672 // This may allow the shuffle to be matched as something cheaper like ZIP1.
30673 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
30674 return WideOp;
30675
30676 // Avoid producing TBL instruction if we don't know SVE register minimal size,
30677 // unless NEON is not available and we can assume minimal SVE register size is
30678 // 128-bits.
30679 if (MinSVESize || !Subtarget->isNeonAvailable())
30680 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
30681 DAG);
30682
30683 return SDValue();
30684}
30685
30686SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
30687 SelectionDAG &DAG) const {
30688 SDLoc DL(Op);
30689 EVT InVT = Op.getValueType();
30690
30691 assert(VT.isScalableVector() && isTypeLegal(VT) &&
30692 InVT.isScalableVector() && isTypeLegal(InVT) &&
30693 "Only expect to cast between legal scalable vector types!");
30694 assert(VT.getVectorElementType() != MVT::i1 &&
30695 InVT.getVectorElementType() != MVT::i1 &&
30696 "For predicate bitcasts, use getSVEPredicateBitCast");
30697
30698 if (InVT == VT)
30699 return Op;
30700
30701 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
30702 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
30703
30704 // Safe bitcasting between unpacked vector types of different element counts
30705 // is currently unsupported because the following is missing the necessary
30706 // work to ensure the result's elements live where they're supposed to within
30707 // an SVE register.
30708 // 01234567
30709 // e.g. nxv2i32 = XX??XX??
30710 // nxv4f16 = X?X?X?X?
30711 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
30712 VT == PackedVT || InVT == PackedInVT) &&
30713 "Unexpected bitcast!");
30714
30715 // Pack input if required.
30716 if (InVT != PackedInVT)
30717 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
30718
30719 if (Subtarget->isLittleEndian() ||
30720 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
30721 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
30722 else {
30723 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
30724 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
30725
30726 // Simulate the effect of casting through memory.
30727 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
30728 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
30729 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
30730 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
30731 if (PackedVTAsInt.getScalarSizeInBits() != 8)
30732 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
30733 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
30734 }
30735
30736 // Unpack result if required.
30737 if (VT != PackedVT)
30738 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
30739
30740 return Op;
30741}
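// A minimal sketch of the big-endian path above, assuming a cast from
// nxv4f32 to nxv8i16: the value is bitcast to nxv4i32, byte-swapped per
// 32-bit element, NVCASTed to nxv8i16, byte-swapped per 16-bit element and
// finally bitcast to the result type, matching a store/reload through memory.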
30742
30743bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
30744 SDValue N) const {
30745 return ::isAllActivePredicate(DAG, N);
30746}
30747
30748EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
30749 return ::getPromotedVTForPredicate(VT);
30750}
30751
30752bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
30753 SDValue Op, const APInt &OriginalDemandedBits,
30754 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
30755 unsigned Depth) const {
30756
30757 unsigned Opc = Op.getOpcode();
30758 switch (Opc) {
30759 case AArch64ISD::VSHL: {
30760 // Match (VSHL (VLSHR Val X) X)
30761 SDValue ShiftL = Op;
30762 SDValue ShiftR = Op->getOperand(0);
30763 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
30764 return false;
30765
30766 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
30767 return false;
30768
30769 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
30770 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
30771
30772 // Other cases can be handled as well, but this is not
30773 // implemented.
30774 if (ShiftRBits != ShiftLBits)
30775 return false;
30776
30777 unsigned ScalarSize = Op.getScalarValueSizeInBits();
30778 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
30779
30780 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
30781 APInt UnusedBits = ~OriginalDemandedBits;
30782
30783 if ((ZeroBits & UnusedBits) != ZeroBits)
30784 return false;
30785
30786 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
30787 // used - simplify to just Val.
30788 return TLO.CombineTo(Op, ShiftR->getOperand(0));
30789 }
30790 case AArch64ISD::BICi: {
30791 // Fold BICi if all destination bits already known to be zeroed
30792 SDValue Op0 = Op.getOperand(0);
30793 KnownBits KnownOp0 =
30794 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
30795 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
30796 APInt BitsToClear =
30797 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
30798 .trunc(KnownOp0.getBitWidth());
30799 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
30800 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
30801 return TLO.CombineTo(Op, Op0);
30802
30803 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
30804 return false;
30805 }
30806 case ISD::INTRINSIC_WO_CHAIN: {
30807 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
30808 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
30809 if (!MaxSVEVectorSizeInBits)
30810 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
30811 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
30812 // The SVE count intrinsics don't support the multiplier immediate so we
30813 // don't have to account for that here. The value returned may be slightly
30814 // over the true required bits, as this is based on the "ALL" pattern. The
30815 // other patterns are also exposed by these intrinsics, but they all
30816 // return a value that's strictly less than "ALL".
30817 unsigned RequiredBits = llvm::bit_width(MaxElements);
30818 unsigned BitWidth = Known.Zero.getBitWidth();
30819 if (RequiredBits < BitWidth)
30820 Known.Zero.setHighBits(BitWidth - RequiredBits);
30821 return false;
30822 }
30823 }
30824 }
30825
30826 return TargetLowering::SimplifyDemandedBitsForTargetNode(
30827 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
30828}
30829
30830bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
30831 return Op.getOpcode() == AArch64ISD::DUP ||
30832 Op.getOpcode() == AArch64ISD::MOVI ||
30833 Op.getOpcode() == AArch64ISD::MOVIshift ||
30834 Op.getOpcode() == AArch64ISD::MOVImsl ||
30835 Op.getOpcode() == AArch64ISD::MOVIedit ||
30836 Op.getOpcode() == AArch64ISD::MVNIshift ||
30837 Op.getOpcode() == AArch64ISD::MVNImsl ||
30838 // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
30839 // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
30840 // fmov from fpr to gpr, which is more expensive than fneg(movi(0))
30841 (Op.getOpcode() == ISD::FNEG &&
30842 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
30843 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
30844 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
30845 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
30846 TargetLowering::isTargetCanonicalConstantNode(Op);
30847}
30848
30849bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
30850 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
30851 Subtarget->hasComplxNum();
30852}
30853
30854bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
30855 ComplexDeinterleavingOperation Operation, Type *Ty) const {
30856 auto *VTy = dyn_cast<VectorType>(Ty);
30857 if (!VTy)
30858 return false;
30859
30860 // If the vector is scalable, SVE is enabled, implying support for complex
30861 // numbers. Otherwise, we need to ensure complex number support is available
30862 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
30863 return false;
30864
30865 auto *ScalarTy = VTy->getScalarType();
30866 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
30867
30868 // We can only process vectors that have a bit size of 128 or higher (with an
30869 // additional 64 bits for Neon). Additionally, these vectors must have a
30870 // power-of-2 size, as we later split them into the smallest supported size
30871 // and merge them back together after applying the complex operation.
30872 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
30873 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
30874 !llvm::isPowerOf2_32(VTyWidth))
30875 return false;
30876
30877 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
30878 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
30879
30880 if (Operation == ComplexDeinterleavingOperation::CDot)
30881 return ScalarWidth == 32 || ScalarWidth == 64;
30882 return 8 <= ScalarWidth && ScalarWidth <= 64;
30883 }
30884
30885 // CDot is not supported outside of scalable/sve scopes
30886 if (Operation == ComplexDeinterleavingOperation::CDot && !VTy->isScalableTy())
30887 return false;
30888
30889 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
30890 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
30891}
30892
30893Value *AArch64TargetLowering::createComplexDeinterleavingIR(
30894 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
30895 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
30896 Value *Accumulator) const {
30897 VectorType *Ty = cast<VectorType>(InputA->getType());
30898 if (Accumulator == nullptr)
30899 Accumulator = Constant::getNullValue(Ty);
30900 bool IsScalable = Ty->isScalableTy();
30901 bool IsInt = Ty->getElementType()->isIntegerTy();
30902
30903 unsigned TyWidth =
30904 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
30905
30906 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
30907 "Vector type must be either 64 or a power of 2 that is at least 128");
30908
30909 if (TyWidth > 128) {
30910 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
30911 int AccStride = cast<VectorType>(Accumulator->getType())
30912 ->getElementCount()
30913 .getKnownMinValue() /
30914 2;
30915 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
30916 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
30917 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
30918 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
30919 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
30920 Value *LowerSplitAcc = nullptr;
30921 Value *UpperSplitAcc = nullptr;
30922 Type *FullTy = Ty;
30923 FullTy = Accumulator->getType();
30924 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
30925 cast<VectorType>(Accumulator->getType()));
30926 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
30927 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
30928 auto *LowerSplitInt = createComplexDeinterleavingIR(
30929 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
30930 auto *UpperSplitInt = createComplexDeinterleavingIR(
30931 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
30932
30933 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
30934 LowerSplitInt, uint64_t(0));
30935 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
30936 }
30937
30938 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
30939 if (IsScalable) {
30940 if (IsInt)
30941 return B.CreateIntrinsic(
30942 Intrinsic::aarch64_sve_cmla_x, Ty,
30943 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
30944
30945 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
30946 return B.CreateIntrinsic(
30947 Intrinsic::aarch64_sve_fcmla, Ty,
30948 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
30949 }
30950
30951 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
30952 Intrinsic::aarch64_neon_vcmla_rot90,
30953 Intrinsic::aarch64_neon_vcmla_rot180,
30954 Intrinsic::aarch64_neon_vcmla_rot270};
30955
30956
30957 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
30958 {Accumulator, InputA, InputB});
30959 }
30960
30961 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
30962 if (IsScalable) {
30963 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
30964 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
30965 if (IsInt)
30966 return B.CreateIntrinsic(
30967 Intrinsic::aarch64_sve_cadd_x, Ty,
30968 {InputA, InputB, B.getInt32((int)Rotation * 90)});
30969
30970 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
30971 return B.CreateIntrinsic(
30972 Intrinsic::aarch64_sve_fcadd, Ty,
30973 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
30974 }
30975 return nullptr;
30976 }
30977
30978 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
30979 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
30980 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
30981 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
30982 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
30983
30984 if (IntId == Intrinsic::not_intrinsic)
30985 return nullptr;
30986
30987 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
30988 }
30989
30990 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
30991 IsScalable) {
30992 return B.CreateIntrinsic(
30993 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
30994 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
30995 }
30996
30997 return nullptr;
30998}
30999
31000bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
31001 unsigned Opc = N->getOpcode();
31002 if (ISD::isExtOpcode(Opc)) {
31003 if (any_of(N->users(),
31004 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
31005 return false;
31006 }
31007 return true;
31008}
31009
31010unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
31011 return Subtarget->getMinimumJumpTableEntries();
31012}
31013
31014MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
31015 CallingConv::ID CC,
31016 EVT VT) const {
31017 bool NonUnitFixedLengthVector =
31018 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
31019 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31020 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
31021
31022 EVT VT1;
31023 MVT RegisterVT;
31024 unsigned NumIntermediates;
31025 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
31026 RegisterVT);
31027 return RegisterVT;
31028}
31029
31030unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
31031 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
31032 bool NonUnitFixedLengthVector =
31033 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
31034 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31035 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
31036
31037 EVT VT1;
31038 MVT VT2;
31039 unsigned NumIntermediates;
31040 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
31041 NumIntermediates, VT2);
31042}
31043
31044unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
31045 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
31046 unsigned &NumIntermediates, MVT &RegisterVT) const {
31047 unsigned NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
31048 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
31049 if (!RegisterVT.isFixedLengthVector() ||
31050 RegisterVT.getFixedSizeInBits() <= 128)
31051 return NumRegs;
31052
31053 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
31054 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
31055 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
31056
31057 // A size mismatch here implies either type promotion or widening and would
31058 // have resulted in scalarisation if larger vectors had not been available.
31059 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
31060 EVT EltTy = VT.getVectorElementType();
31061 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
31062 if (!isTypeLegal(NewVT))
31063 NewVT = EltTy;
31064
31065 IntermediateVT = NewVT;
31066 NumIntermediates = VT.getVectorNumElements();
31067 RegisterVT = getRegisterType(Context, NewVT);
31068 return NumIntermediates;
31069 }
31070
31071 // SVE VLS support does not introduce a new ABI so we should use NEON sized
31072 // types for vector arguments and returns.
31073
31074 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
31075 NumIntermediates *= NumSubRegs;
31076 NumRegs *= NumSubRegs;
31077
31078 switch (RegisterVT.getVectorElementType().SimpleTy) {
31079 default:
31080 llvm_unreachable("unexpected element type for vector");
31081 case MVT::i8:
31082 IntermediateVT = RegisterVT = MVT::v16i8;
31083 break;
31084 case MVT::i16:
31085 IntermediateVT = RegisterVT = MVT::v8i16;
31086 break;
31087 case MVT::i32:
31088 IntermediateVT = RegisterVT = MVT::v4i32;
31089 break;
31090 case MVT::i64:
31091 IntermediateVT = RegisterVT = MVT::v2i64;
31092 break;
31093 case MVT::f16:
31094 IntermediateVT = RegisterVT = MVT::v8f16;
31095 break;
31096 case MVT::f32:
31097 IntermediateVT = RegisterVT = MVT::v4f32;
31098 break;
31099 case MVT::f64:
31100 IntermediateVT = RegisterVT = MVT::v2f64;
31101 break;
31102 case MVT::bf16:
31103 IntermediateVT = RegisterVT = MVT::v8bf16;
31104 break;
31105 }
31106
31107 return NumRegs;
31108}
31109
31110bool AArch64TargetLowering::hasInlineStackProbe(
31111 const MachineFunction &MF) const {
31112 return !Subtarget->isTargetWindows() &&
31113 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
31114}
31115
31116bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
31117 switch (Opc) {
31118 case ISD::TRUNCATE_SSAT_S:
31119 case ISD::TRUNCATE_SSAT_U:
31120 case ISD::TRUNCATE_USAT_U:
31121 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
31122 return true;
31123 }
31124
31125 return TargetLowering::isTypeDesirableForOp(Opc, VT);
31126}
31127
31128bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
31129 EVT VT) const {
31130 return Subtarget->hasCPA() && UseFEATCPACodegen;
31131}
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
SDValue tryLowerPartialReductionToWideAdd(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static unsigned getFPSubregForVT(EVT VT)
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static const MachineInstr * stripVRegCopies(const MachineRegisterInfo &MRI, Register Reg)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue performSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
If the operand is a bitwise AND with a constant RHS, and the shift has a constant RHS and is the only...
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
bool isVectorizedBinOp(unsigned Opcode)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue emitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &DL, SelectionDAG &DAG)
Emit vector comparison for floating-point values, producing a mask.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static SDValue performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static bool shouldLowerTailCallStackArg(const MachineFunction &MF, const CCValAssign &VA, SDValue Arg, ISD::ArgFlagsTy Flags, int CallOffset)
Check whether a stack argument requires lowering in a tail call.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerPtrAuthGlobalAddressStatically(SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC, SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool isCMP(SDValue Op)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget, const AtomicRMWInst *RMW)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
unsigned numberOfInstrToLoadImm(APInt C)
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool callConvSupportsVarArgs(CallingConv::ID CC)
Return true if the call convention supports varargs. Currently only those that pass varargs like the C...
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static std::optional< std::pair< unsigned, const TargetRegisterClass * > > parseSVERegAsConstraint(StringRef Constraint)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
constexpr MVT CondCodeVT
Value type used for condition codes.
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG)
SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
bool isLegalCmpImmed(APInt C)
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC, SDValue RHS={})
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
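The quoted equivalence is easy to sanity-check with a tiny scalar model; csinc and cset below are illustrative helpers, not LLVM APIs.
#include <cassert>
// CSINC(a, b, cond) yields a when cond holds and b+1 otherwise, so
// CSET(cond) == CSINC(0, 0, invert(cond)) == (cond ? 1 : 0).
static unsigned csinc(unsigned A, unsigned B, bool Cond) { return Cond ? A : B + 1; }
static unsigned cset(bool Cond) { return csinc(0, 0, !Cond); }
int main() {
  assert(cset(true) == 1);
  assert(cset(false) == 0);
  return 0;
}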
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *ST)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
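As a scalar illustration of this fold (assuming the multiply by a power of two is exact): converting x * 2^n to an integer with truncation is what an AArch64 fixed-point convert with n fractional bits computes.
#include <cassert>
#include <cmath>
#include <cstdint>
// fptosi(x * 2^n), truncating toward zero, matches a fixed-point FCVTZS with
// n fractional bits. std::ldexp(1.0f, n) is exactly 2^n.
static int32_t mulThenConvert(float X, unsigned N) {
  return static_cast<int32_t>(X * std::ldexp(1.0f, N));
}
int main() {
  assert(mulThenConvert(1.75f, 4) == 28);   // 1.75 * 16
  assert(mulThenConvert(-2.5f, 2) == -10);  // -2.5 * 4
  return 0;
}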
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
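The underlying identity can be checked on a scalar (a sketch only; vector lanes behave the same way element-wise):
#include <cassert>
#include <cstdint>
// ~(x >> (bits-1)) with an arithmetic shift is an all-ones mask exactly when
// x >= 0, which is what a "cmge x, #0" lane produces.
static int32_t viaXorSra(int32_t X) { return ~(X >> 31); }
static int32_t viaCmge(int32_t X) { return X >= 0 ? -1 : 0; }
int main() {
  for (int32_t X : {INT32_MIN, -5, -1, 0, 1, 42, INT32_MAX})
    assert(viaXorSra(X) == viaCmge(X));
  return 0;
}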
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Tries to replace scalar FP <-> INT conversions with SVE in streaming functions; this can help to redu...
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC)
Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, ISD::CondCode CC, bool NoNaNs, const SDLoc &DL, SelectionDAG &DAG)
For SELECT_CC, when the true/false values are (-1, 0) and the compared values are scalars,...
static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC)
static SDValue combineStoreValueFPToInt(StoreSDNode *ST, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue isNVCastToHalfWidthElements(SDValue V)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
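The scalar identity behind a combine of this shape (illustrative only; the exact matched pattern lives in performAddCSelIntoCSinc):
#include <cassert>
#include <cstdint>
// (cc ? c : 1) + b folds to a single conditional select-increment:
// cc ? (b + c) : (b + 1), i.e. a CSINC choosing between b+c and b.
int main() {
  for (bool CC : {false, true})
    for (int32_t B : {-4, 0, 7})
      for (int32_t C : {-1, 2, 9})
        assert((CC ? C : 1) + B == (CC ? B + C : B + 1));
  return 0;
}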
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SMECallAttrs getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG)
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc DL)
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static cl::opt< bool > UseFEATCPACodegen("aarch64-use-featcpa-codegen", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in " "SelectionDAG for FEAT_CPA"), cl::init(false))
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T)
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one Idx.
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) such that 'BinOp V, RHS' can be simplified.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
Register const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable; if it's a VBR-encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setSMESaveBufferUsed(bool Used=true)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
std::optional< uint16_t > getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const
Compute the integer discriminator for a given BlockAddress constant, if blockaddress signing is enabl...
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
bool isStreamingSVEAvailable() const
Returns true if the target has access to the streaming-compatible subset of SVE instructions.
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool isNonStreamingSVEorSME2Available() const
Returns true if the target has access to either the full range of SVE instructions,...
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
const char * getChkStkName() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool hasCustomCallingConv() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override
Return true if the @llvm.experimental.vector.partial.reduce.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this functions.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
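The equivalence follows from the two's-complement identity ~x == -x - 1 and is easy to verify on scalars:
#include <cassert>
#include <cstdint>
int main() {
  // y - ~x == y - (-x - 1) == (x + 1) + y
  for (int32_t X : {-7, 0, 3, 100})
    for (int32_t Y : {-1, 0, 42})
      assert(Y - ~X == (X + 1) + Y);
  return 0;
}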
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like an 'and' with an immediate mask followed by a compare against zero.
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
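For logical shifts on unsigned values the two forms test the same condition, as a quick scalar check shows (illustrative only):
#include <cassert>
#include <cstdint>
int main() {
  // (X & (C >> Y)) != 0  <=>  ((X << Y) & C) != 0 for logical shifts.
  const uint32_t Xs[] = {0u, 1u, 0x80000000u, 0x00F0F0F0u, 0xDEADBEEFu};
  const uint32_t Cs[] = {0u, 1u, 0x00010000u, 0xFFFF0000u, 0x0000FFFFu};
  for (uint32_t X : Xs)
    for (uint32_t C : Cs)
      for (unsigned Y = 0; Y < 32; ++Y)
        assert(((X & (C >> Y)) != 0) == (((X << Y) & C) != 0));
  return 0;
}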
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1890
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1033
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1928
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1166
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1935
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1041
unsigned logBase2() const
Definition: APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:985
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
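A small usage sketch of the llvm::APInt helpers listed above (assumes the LLVM headers are available to build against):
#include "llvm/ADT/APInt.h"
#include <cassert>
using llvm::APInt;
int main() {
  APInt Imm(64, 0x0000FFFF00000000ULL);
  assert(Imm.popcount() == 16);       // sixteen bits set
  assert(Imm.countr_zero() == 32);    // lowest set bit is bit 32
  assert(!Imm.isMask(16));            // not a contiguous low-bits mask
  assert(APInt::getLowBitsSet(64, 16) == APInt(64, 0xFFFF));
  assert(APInt(64, 64).isPowerOf2() && APInt(64, 64).logBase2() == 6);
  return 0;
}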
an instruction to allocate memory on the stack
Definition: Instructions.h:64
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
Definition: Instructions.h:765
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ And
*p = old & v
Definition: Instructions.h:729
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
Definition: Instructions.h:761
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:757
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:753
@ Nand
*p = ~(old & v)
Definition: Instructions.h:731
bool isFloatingPointOperation() const
Definition: Instructions.h:898
BinOp getOperation() const
Definition: Instructions.h:819
This is an SDNode representing atomic operations.
LLVM_ABI bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
const BlockAddress * getBlockAddress() const
The address of a basic block.
Definition: Constants.h:899
Function * getFunction() const
Definition: Constants.h:935
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
LLVM_ABI bool isConstant() const
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
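A short sketch of how call-lowering code typically consumes CCValAssign results after CCState has analyzed the operands (ArgLocs is assumed to be a SmallVector<CCValAssign, 16> filled by that analysis):

for (const CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    // The value was assigned to a register.
    Register Reg = VA.getLocReg();
    (void)Reg;
  } else {
    // Otherwise it lives at a stack offset relative to the incoming SP.
    assert(VA.isMemLoc() && "unexpected location");
    int64_t Offset = VA.getLocMemOffset();
    (void)Offset;
  }
}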
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
unsigned arg_size() const
Definition: InstrTypes.h:1290
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1506
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:198
bool isBigEndian() const
Definition: DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
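A minimal sketch of the DataLayout queries above, assuming a DataLayout DL and a Type *Ty are already in scope:

// Size in bytes including alignment padding between consecutive objects.
TypeSize Size = DL.getTypeAllocSize(Ty);
// Preferred alignment for the type, and the module's byte order.
Align PrefAlign = DL.getPrefTypeAlign(Ty);
bool LittleEndian = DL.isLittleEndian();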
A debug info location.
Definition: DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:187
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:315
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:323
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
static FixedVectorType * getInteger(FixedVectorType *VTy)
Definition: DerivedTypes.h:604
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:706
bool empty() const
Definition: Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1036
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
arg_iterator arg_end()
Definition: Function.h:875
arg_iterator arg_begin()
Definition: Function.h:866
size_t size() const
Definition: Function.h:856
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:265
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:531
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:132
Type * getValueType() const
Definition: GlobalValue.h:298
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2214
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1093
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1936
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2625
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1107
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2155
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2251
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains NumElts copies of V.
Definition: IRBuilder.cpp:1115
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2618
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:502
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2128
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1513
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:512
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2142
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:533
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
LLVMContext & getContext() const
Definition: IRBuilder.h:203
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2194
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1532
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:552
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
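The IRBuilder methods above are the ones used by the IR-emitting hooks in this class (atomic, TLS and interleave lowering). A hedged sketch, assuming an Instruction *InsertBefore and an i32 Value *Val are in scope:

IRBuilder<> Builder(InsertBefore);
// Widen the value, splat it, and call a type-overloaded intrinsic on it.
Value *Wide  = Builder.CreateZExt(Val, Builder.getIntNTy(64));
Value *Splat = Builder.CreateVectorSplat(4, Wide, "splat");
CallInst *Ctlz = Builder.CreateIntrinsic(Intrinsic::ctlz, {Wide->getType()},
                                         {Wide, Builder.getTrue()});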
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:56
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:265
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:43
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:101
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:201
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:180
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
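A brief sketch of the MVT classification queries above, assuming an SDValue Op whose type is a fixed-length vector:

MVT VT = Op.getSimpleValueType();
if (VT.isFixedLengthVector()) {
  unsigned NumElts = VT.getVectorNumElements();
  MVT EltVT = VT.getVectorElementType();
  // Build the corresponding vector type with elements twice as wide.
  MVT WideVT = MVT::getVectorVT(
      MVT::getIntegerVT(2 * EltVT.getScalarSizeInBits()), NumElts);
  (void)WideVT;
}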
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
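A sketch of the frame-object creation calls above, assuming a MachineFunction &MF is in scope (the sizes and offsets are illustrative only):

MachineFrameInfo &MFI = MF.getFrameInfo();
// A fixed object at a known offset from the incoming stack pointer
// (e.g. a byval argument), and an ordinary statically sized object.
int FixedFI = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/0,
                                    /*IsImmutable=*/true);
int FI = MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/false);
int64_t Bytes = MFI.getObjectSize(FI);
(void)FixedFI; (void)Bytes;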
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
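The MachineInstrBuilder chaining style above is how the custom-inserter hooks in this file materialize MachineInstrs. A hedged sketch, assuming a MachineBasicBlock *BB, a MachineInstr &MI, and const TargetInstrInfo *TII are in scope:

const DebugLoc &DL = MI.getDebugLoc();
Register Dst = MI.getOperand(0).getReg();
// ADD Xd, Xn, #0 (imm12 = 0, shift = 0): a trivial copy-like instruction.
BuildMI(*BB, MI, DL, TII->get(AArch64::ADDXri), Dst)
    .addReg(MI.getOperand(1).getReg())
    .addImm(0)
    .addImm(0);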
Representation of each machine instruction.
Definition: MachineInstr.h:72
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:56
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:710
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:352
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:146
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1885
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
bool isAssert() const
Test if this node is an assert operation.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
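A minimal sketch of the SDValue/SDNode inspection pattern the DAG combines in this file rely on (the helper is illustrative, not from the file):

// Match (add x, C) with a single use and return the constant.
static bool isSingleUseAddOfConstant(SDValue V, uint64_t &Imm) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!C)
    return false;
  Imm = C->getZExtValue();
  return true;
}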
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
bool hasZT0State() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:639
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:825
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
Definition: SelectionDAG.h:941
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, bool ConstantFold=true)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:506
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:719
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:902
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:808
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:885
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:777
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
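A short, hedged sketch of building nodes with the SelectionDAG API above, assuming a SelectionDAG &DAG and an integer-typed SDValue Op are in scope (this is a generic integer-abs expansion, shown purely as an example):

SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Neg  = DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
// select (setlt Op, 0), -Op, Op
SDValue Abs  = DAG.getSelectCC(DL, Op, Zero, Neg, Op, ISD::SETLT);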
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a de-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
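A brief sketch of the shuffle-mask helpers above, assuming a ShuffleVectorSDNode *SVN is in scope:

ArrayRef<int> Mask = SVN->getMask();
int NumElts = Mask.size();
bool IsSplat   = ShuffleVectorSDNode::isSplatMask(Mask);
bool IsReverse = ShuffleVectorInst::isReverseMask(Mask, NumElts);
(void)IsSplat; (void)IsReverse;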
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:176
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:806
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:287
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:34
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:480
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:581
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:269
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition: StringRef.h:619
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:694
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:281
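A small sketch of the StringRef helpers above, in the style of inline-asm constraint parsing (the string contents are made up):

StringRef S("v15");
unsigned RegNo = 0;
// getAsInteger returns true on parse failure, false on success.
if (S.starts_with("v") && !S.drop_front().getAsInteger(10, RegNo)) {
  // RegNo is now 15.
}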
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
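StringSwitch is typically used to map textual names to target values, for example in getRegisterByName-style hooks; a hedged sketch assuming a StringRef Name is in scope:

Register Reg = StringSwitch<Register>(Name)
                   .Case("sp", AArch64::SP)
                   .Case("fp", AArch64::FP)
                   .Default(Register());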
Class to represent struct types.
Definition: DerivedTypes.h:218
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
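The legalization setters above (addRegisterClass, setLoadExtAction, setTruncStoreAction, computeRegisterProperties) are normally invoked from a target's TargetLowering constructor. A purely illustrative fragment follows; the MyTarget register class and Subtarget accessor are hypothetical and do not reflect AArch64's actual configuration:
  // Sketch of a TargetLowering subclass constructor body (illustrative only).
  addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);      // hypothetical register class
  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Legal);  // i8->i32 sextload handled natively
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);            // expand f64->f32 truncating stores
  computeRegisterProperties(Subtarget.getRegisterInfo());     // derive register properties last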
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:687
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:264
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
@ HalfTyID
16-bit floating point type
Definition: Type.h:56
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition: Type.h:57
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:311
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:5465
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:534
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:481
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:499
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:463
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:255
self_iterator getIterator()
Definition: ilist_node.h:134
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
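A short sketch of how these AArch64CC helpers compose, assuming the usual EQ/NE enumerators:
  // Invert a condition code, then query the NZCV flags that satisfy the result.
  AArch64CC::CondCode CC  = AArch64CC::EQ;
  AArch64CC::CondCode Inv = AArch64CC::getInvertedCondCode(CC);  // AArch64CC::NE
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Inv);      // flag combination making NE true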
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
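A minimal sketch pairing isLogicalImmediate with encodeLogicalImmediate, assuming the AArch64_AM namespace from the in-tree MCTargetDesc/AArch64AddressingModes.h header:
  // 0xFF00FF00FF00FF00 repeats a 16-bit element pattern, so it is a valid
  // 64-bit logical immediate; encode it for AND/ORR/EOR immediate forms.
  uint64_t Imm = 0xFF00FF00FF00FF00ULL;
  if (AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
    uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
    (void)Enc; // packed N:immr:imms field expected by the instruction encoder
  }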
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:271
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition: CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:504
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1386
@ PARTIAL_REDUCE_SMLA
Definition: ISDOpcodes.h:1510
@ LOOP_DEPENDENCE_RAW_MASK
Definition: ISDOpcodes.h:1565
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1458
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1401
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1491
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1265
@ ConstantFP
Definition: ISDOpcodes.h:87
@ STRICT_FATAN2
Definition: ISDOpcodes.h:441
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ STRICT_FCEIL
Definition: ISDOpcodes.h:454
@ STRICT_FTANH
Definition: ISDOpcodes.h:444
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1131
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ VECTOR_FIND_LAST_ACTIVE
Definition: ISDOpcodes.h:1550
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
Definition: ISDOpcodes.h:1098
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:1020
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
Definition: ISDOpcodes.h:1094
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1476
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1480
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1135
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1490
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:505
@ STRICT_FLOG2
Definition: ISDOpcodes.h:449
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1309
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1574
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1384
@ GlobalTLSAddress
Definition: ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ PARTIAL_REDUCE_UMLA
Definition: ISDOpcodes.h:1511
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ STRICT_FASIN
Definition: ISDOpcodes.h:438
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:117
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1473
@ STRICT_FATAN
Definition: ISDOpcodes.h:440
@ WRITE_REGISTER
Definition: ISDOpcodes.h:135
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ TRUNCATE_SSAT_U
Definition: ISDOpcodes.h:855
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1477
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:809
@ STRICT_LROUND
Definition: ISDOpcodes.h:459
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1162
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition: ISDOpcodes.h:622
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:682
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ STRICT_FPOWI
Definition: ISDOpcodes.h:433
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1492
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:663
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1261
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1387
@ GET_ACTIVE_LANE_MASK
Definition: ISDOpcodes.h:1559
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:458
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1485
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ ATOMIC_LOAD_FMAXIMUM
Definition: ISDOpcodes.h:1388
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1126
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1376
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:463
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:48
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:452
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:928
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:453
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ STRICT_FSINH
Definition: ISDOpcodes.h:442
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1448
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1358
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1325
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ ATOMIC_LOAD_FMINIMUM
Definition: ISDOpcodes.h:1389
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ STRICT_LRINT
Definition: ISDOpcodes.h:461
@ ConstantPool
Definition: ISDOpcodes.h:92
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:627
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ STRICT_FROUND
Definition: ISDOpcodes.h:456
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:477
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1413
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1493
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:455
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:110
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ STRICT_FCOSH
Definition: ISDOpcodes.h:443
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:420
@ STRICT_FLOG10
Definition: ISDOpcodes.h:448
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ STRICT_LLRINT
Definition: ISDOpcodes.h:462
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:648
@ STRICT_FEXP2
Definition: ISDOpcodes.h:446
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively places vector elements based on mask e....
Definition: ISDOpcodes.h:690
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:122
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ STRICT_LLROUND
Definition: ISDOpcodes.h:460
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:903
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1546
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:927
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1481
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1256
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ BlockAddress
Definition: ISDOpcodes.h:94
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ PARTIAL_REDUCE_SUMLA
Definition: ISDOpcodes.h:1512
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ STRICT_FRINT
Definition: ISDOpcodes.h:450
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition: ISDOpcodes.h:611
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition: ISDOpcodes.h:853
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:713
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1315
@ TRUNCATE_USAT_U
Definition: ISDOpcodes.h:857
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ STRICT_FACOS
Definition: ISDOpcodes.h:439
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
@ LOOP_DEPENDENCE_WAR_MASK
Definition: ISDOpcodes.h:1564
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1762
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
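For example, a sketch for the integer case:
  // !(x < y) is x >= y; (x < y) with operands swapped is (y > x).
  ISD::CondCode CC   = ISD::SETLT;
  ISD::CondCode Inv  = ISD::getSetCCInverse(CC, MVT::i32);  // ISD::SETGE
  ISD::CondCode Swap = ISD::getSetCCSwappedOperands(CC);    // ISD::SETGT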
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1653
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1640
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1691
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1671
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1642
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
@ Bitcast
Perform the operation on a different, but equivalently sized type.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition: ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:216
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:355
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:270
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:252
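A sketch of the relationship between maxUIntN (listed above) and isUIntN, both from llvm/Support/MathExtras.h:
  // A value fits in N unsigned bits exactly when it is <= maxUIntN(N).
  static_assert(llvm::maxUIntN(12) == 4095, "2^12 - 1");
  bool Fits = llvm::isUIntN(12, 4095); // true
  bool Over = llvm::isUIntN(12, 4096); // false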
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition: MathExtras.h:293
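A sketch tying isPowerOf2_64 to the log and trailing-zero helpers in this listing (llvm/Support/MathExtras.h and llvm/ADT/bit.h):
  // For a power of two, the floor log2 equals the trailing-zero count.
  uint64_t V = 1ULL << 37;
  assert(llvm::isPowerOf2_64(V));
  assert(llvm::Log2_64(V) == 37);
  assert(llvm::countr_zero(V) == 37);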
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1587
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition: Error.cpp:177
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:282
unsigned M1(unsigned Val)
Definition: VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:270
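The two mask predicates, isMask_64 and isShiftedMask_64, differ only in whether the run of ones must start at bit 0; a sketch:
  assert(llvm::isMask_64(0xFFULL));          // contiguous ones starting at bit 0
  assert(!llvm::isMask_64(0xFF0ULL));        // run starts at bit 4, so not a mask
  assert(llvm::isShiftedMask_64(0xFF0ULL));  // but it is a shifted mask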
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:82
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
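A sketch of querying the shuffle-mask predicates above (isZIPMask, isUZPMask, isTRNMask) with the v8 patterns quoted in their descriptions; WhichResult distinguishes the *1 and *2 variants:
  // zip1 of two v8 inputs interleaves the low halves: <0,8,1,9,2,10,3,11>.
  int Zip1[] = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned Which = 0;
  bool IsZip = llvm::isZIPMask(Zip1, /*NumElts=*/8, Which); // true, zip1 variant
  // trn1 keeps the even lanes of both inputs: <0,8,2,10,4,12,6,14>.
  int Trn1[] = {0, 8, 2, 10, 4, 12, 6, 14};
  bool IsTrn = llvm::isTRNMask(Trn1, /*NumElts=*/8, Which); // true, trn1 variant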
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
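A sketch of the Align-based overload of alignTo:
  // Round a 21-byte size up to a 16-byte boundary.
  llvm::Align A(16);
  uint64_t Padded = llvm::alignTo(21, A); // 32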
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:119
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2139
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:257
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2127
static const MachineMemOperand::Flags MOStridedAccess
@ Default
The result values are uniform if and only if all operands are uniform.
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
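A sketch of createSequentialMask, assuming trailing undefined lanes are encoded as -1 in the returned mask:
  // {2, 3, 4, 5, -1, -1}: four sequential indices starting at 2, then two undefs.
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/2, /*NumInts=*/4, /*NumUndefs=*/2);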
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition: Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:324
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:397
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:439
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:187
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
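The EVT entries above are the value-type query and transform API this lowering code leans on. A hedged sketch of how a few of them compose (the helper name and the v4f32 starting point are illustrative only):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

using namespace llvm;

// Sketch: from a fixed-length FP vector, derive the same-shaped integer
// vector and the vector with half as many elements.
static void evtSketch(LLVMContext &Ctx) {
  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 4);        // v4f32
  assert(VT.isFixedLengthVector() && VT.isFloatingPoint());

  EVT IntVT = VT.changeVectorElementTypeToInteger();  // v4i32
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);    // v2f32

  assert(IntVT.getSizeInBits() == VT.getSizeInBits());
  assert(HalfVT.getVectorNumElements() == VT.getVectorNumElements() / 2);
  (void)IntVT;
  (void)HalfVT;
}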
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:294
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:427
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:154
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:370
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:289
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:304
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute known bits resulting from the addition of LHS and RHS.
Definition: KnownBits.h:340
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:803
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition: KnownBits.h:128
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:285
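The KnownBits operations above back the target's computeKnownBitsForTargetNode logic. A small self-contained sketch with made-up constants, just to show how the combinators compose:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

// Sketch: shift a fully known constant and add the pieces back together.
static void knownBitsSketch() {
  KnownBits C  = KnownBits::makeConstant(APInt(32, 0xF0));
  KnownBits Sh = KnownBits::makeConstant(APInt(32, 4));

  KnownBits Lo = KnownBits::lshr(C, Sh);   // exactly 0x0F, still fully known
  assert(Lo.isConstant() && Lo.getConstant() == 0x0F);

  KnownBits Sum = KnownBits::add(C, Lo);   // 0xF0 + 0x0F == 0xFF
  assert(Sum.countMaxActiveBits() <= 8);
}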
Matching combinators.
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
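The MachinePointerInfo factories above attach provenance to the memory operands created while lowering. A hedged sketch (the frame index and the 8-byte offset are placeholders supplied by the caller):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// Sketch: describe an access to fixed stack slot FI, 8 bytes past its start.
static MachinePointerInfo stackSlotInfo(MachineFunction &MF, int FI) {
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return PtrInfo.getWithOffset(8);
}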
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
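MaybeAlign is the optional counterpart of Align; a one-line sketch of the usual "unknown means 1" fallback:

#include "llvm/Support/Alignment.h"

// Sketch: an unset MaybeAlign decays to Align(1) via valueOrOne().
static llvm::Align pickAlign(llvm::MaybeAlign MA) { return MA.valueOrOne(); }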
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:249
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
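The CallLoweringInfo setters above chain in builder style. A hedged sketch of the usual libcall pattern (the symbol name __example_helper and the void return type are placeholders, not something this file emits):

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Type.h"
#include <utility>

using namespace llvm;

// Sketch: lower a call to an external helper taking no arguments.
static SDValue emitHelperCall(SelectionDAG &DAG, const TargetLowering &TLI,
                              SDValue Chain, const SDLoc &DL) {
  Type *RetTy = Type::getVoidTy(*DAG.getContext());
  SDValue Callee = DAG.getExternalSymbol(
      "__example_helper", TLI.getPointerTy(DAG.getDataLayout()));

  TargetLowering::ArgListTy Args;            // empty for this sketch
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
      CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
  return Result.second;                      // the updated chain
}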
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64
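The last few entries describe this file's internal SETCC bookkeeping: a union that views a comparison either as generic ISD::SETCC operands or as an already-lowered AArch64 compare. A hedged sketch of that shape (field names beyond Generic, AArch64 and CC are illustrative; the real definitions live in this file's anonymous namespace):

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "Utils/AArch64BaseInfo.h"   // target-internal: AArch64CC::CondCode

// Generic view: the two SETCC operands plus an ISD condition code.
struct GenericSetCCInfo {
  const llvm::SDValue *Opnd0;
  const llvm::SDValue *Opnd1;
  llvm::ISD::CondCode CC;
};

// AArch64 view: the materialized compare node plus an AArch64 condition code.
struct AArch64SetCCInfo {
  const llvm::SDValue *Cmp;
  llvm::AArch64CC::CondCode CC;
};

// Discriminated storage; the surrounding code tracks which member is active.
union SetCCInfo {
  GenericSetCCInfo Generic;
  AArch64SetCCInfo AArch64;
};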